updated table frontend

2025-08-07 11:45:23 +03:00
commit 917396ec15
10 changed files with 7169 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.venv
+__pycache__
--- a/accountRecords.csv
+++ b/accountRecords.csv
--- a/account_records_incoming.csv
+++ b/account_records_incoming.csv
--- a/base_import.py
+++ b/base_import.py
@@ -0,0 +1,61 @@
+from contextlib import contextmanager
+from functools import lru_cache
+from typing import Generator
+
+from sqlalchemy import create_engine
+from sqlalchemy.orm import declarative_base, sessionmaker, scoped_session, Session
+
+
+# Shared SQLAlchemy engine with an explicitly sized connection pool.
+# NOTE(review): the connection URL (including credentials) is hard-coded here;
+# consider loading it from configuration or the environment.
+engine = create_engine(
+    "postgresql+psycopg2://postgres:password@10.10.2.14:5432/postgres",
+    echo=False,  # no SQL statement logging
+    pool_size=10,  # connections kept open in the pool
+    max_overflow=5,  # extra connections allowed beyond pool_size
+    pool_timeout=30,  # seconds to wait for a free connection
+    pool_recycle=600,  # recycle connections older than this many seconds
+    pool_pre_ping=True,  # test each connection on checkout
+)
+
+# Declarative base class for ORM models.
+Base = declarative_base()
+
+
+# Session registry, built on first use and cached for the life of the process
+@lru_cache()
+def get_session_factory() -> scoped_session:
+    """Return the scoped-session registry bound to the module-level ``engine``.
+
+    The function takes no arguments, so ``lru_cache`` makes every call return
+    the same registry object.
+
+    Returns:
+        scoped_session: registry wrapping a ``sessionmaker`` for ``engine``.
+    """
+    # expire_on_commit=True means ORM instances are expired after each commit
+    # and reloaded on next access.
+    # NOTE(review): the earlier comment here said "Prevent expired object
+    # issues", which describes expire_on_commit=False - confirm the intent.
+    maker = sessionmaker(
+        bind=engine,
+        expire_on_commit=True,
+        autoflush=False,
+        autocommit=False,
+    )
+    return scoped_session(maker)
+
+
+# Context-managed session: commit on success, roll back on error, always clean up
+@contextmanager
+def get_db() -> Generator[Session, None, None]:
+    """Yield a session from the scoped registry inside a commit/rollback scope.
+
+    When the ``with`` body finishes normally the session is committed. If the
+    body or the commit raises an ``Exception`` the session is rolled back and
+    the exception is re-raised. In every case the session is closed and then
+    removed from the registry.
+
+    Yields:
+        Session: SQLAlchemy session object
+    """
+    registry = get_session_factory()
+    db_session = registry()
+    try:
+        yield db_session
+        db_session.commit()
+    except Exception:
+        db_session.rollback()
+        raise
+    finally:
+        db_session.close()
+        registry.remove()  # drop the session from the scoped registry
--- a/dataCleaning.py
+++ b/dataCleaning.py
@@ -0,0 +1,225 @@
+import re
+import pandas as pd
+from unidecode import unidecode
+from difflib import SequenceMatcher
+from itertools import permutations
+from time import perf_counter
+from base_import import get_session_factory
+from sqlalchemy import text as sqlalchemy_text
+
+# Session used for the lookup query below (opened at import time).
+session_factory = get_session_factory()
+session = session_factory()
+
+# Full Turkish month names, upper case, in calendar order.
+turkish_months = [
+    "OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN",
+    "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK",
+]
+# Abbreviation -> full month name.
+turkish_months_abbr = {
+    "OCA": "OCAK",
+    "SUB": "ŞUBAT",
+    "ŞUB": "ŞUBAT",
+    "MAR": "MART",
+    "NIS": "NİSAN",
+    "MAY": "MAYIS",
+    "HAZ": "HAZİRAN",
+    "HZR": "HAZİRAN",
+    "TEM": "TEMMUZ",
+    "AGU": "AĞUSTOS",
+    "AGT": "AĞUSTOS",
+    "EYL": "EYLÜL",
+    "EKI": "EKİM",
+    "KAS": "KASIM",
+    "ARA": "ARALIK",
+}
+# Inclusive range of years accepted by the year extractors.
+start_year = 1950
+current_year = pd.Timestamp.now().year
+
+# Name columns plus the build_living_space id for every row joined to build 1.
+people = session.execute(
+    sqlalchemy_text("""
+    SELECT p.firstname, p.middle_name, p.surname, bl.id FROM public.people as p
+    inner join public.build_living_space as bl on bl.person_id = p.id
+    inner join public.build_parts as bp on bp.id = bl.build_parts_id
+    inner join public.build as b on b.id = bp.build_id
+    where b.id = 1
+    """)
+).all()
+people = [
+    {"firstname": first, "middle_name": middle, "last_name": last, 'id': living_space_id}
+    for first, middle, last, living_space_id in people
+]
+
+
+def clean_text(text):
+    """Squeeze whitespace in *text*, then blank out ``-``, ``*`` and ``,``.
+
+    The value is stringified first. Runs of whitespace are collapsed to single
+    spaces (leading/trailing whitespace dropped) *before* the punctuation is
+    replaced, so the result can contain consecutive spaces where punctuation
+    sat next to a space.
+    """
+    squeezed = " ".join(str(text).split())
+    for separator in ("-", "*", ","):
+        squeezed = squeezed.replace(separator, " ")
+    return squeezed
+
+
+def is_valid_char_match(word, month):
+    """Return True when every distinct character of *word* also occurs in *month*."""
+    return set(word) <= set(month)
+
+
+def best_month_matches(text, threshold=0.7):
+    """Find month names or abbreviations among the words of *text*.
+
+    Each word of the cleaned text is first looked up in ``turkish_months_abbr``
+    and otherwise fuzzy-matched against ``turkish_months``. Words are folded
+    to upper-case ASCII before comparison, the same form the month names are
+    compared in, so spellings with Turkish letters ("EYLÜL", "AĞU") and
+    lower-case words can match.
+
+    Args:
+        text: value to scan; it is passed through ``clean_text`` first.
+        threshold: minimum ``SequenceMatcher`` ratio for a fuzzy match.
+
+    Returns:
+        list: ``(month.capitalize(), original_word, score)`` tuples, one per
+        matching word, where score is 1.0 for abbreviation hits and the rounded
+        ratio otherwise; ``["N/A"]`` when nothing matched.
+    """
+    # Fold the month names once per call instead of once per word.
+    folded_months = [(month, unidecode(month.upper())) for month in turkish_months]
+    matches = []
+
+    for word in clean_text(text).split():
+        folded_word = unidecode(word).upper()
+
+        # Abbreviation table first: try the word as written, then its folded form.
+        full_month = turkish_months_abbr.get(word) or turkish_months_abbr.get(folded_word)
+        if full_month:
+            matches.append((full_month.capitalize(), word, 1.0))
+            continue
+
+        # Otherwise take the first month that is similar enough and whose
+        # letters cover every letter of the word.
+        for month, folded_month in folded_months:
+            ratio = SequenceMatcher(None, folded_word, folded_month).ratio()
+            if ratio >= threshold and is_valid_char_match(folded_word, folded_month):
+                matches.append((month.capitalize(), word, round(ratio, 2)))
+                break
+
+    return matches if matches else ["N/A"]
+
+
+def print_out_results(data_frame, df):
+    """Print each item of *data_frame* with its position, then ``len(data_frame) / len(df)``."""
+    position = 0
+    for comment in data_frame:
+        print(f'Row number {position} : {comment}')
+        position += 1
+    print(len(data_frame), '/', len(df))
+
+
+def extract_years(text):
+    """Return the standalone 19xx/20xx numbers in *text* that fall within
+    ``start_year``..``current_year`` (as strings), or ``["N/A"]`` if there are none."""
+    in_range = []
+    for candidate in re.findall(r'\b(19\d{2}|20\d{2})\b', text):
+        if start_year <= int(candidate) <= current_year:
+            in_range.append(candidate)
+    return in_range or ["N/A"]
+
+
+def extract_months(text):
+    """Return the entries of ``turkish_months`` whose ASCII lower-case form
+    occurs as a substring of the ASCII lower-case form of *text*.
+
+    Returns ``["N/A"]`` when no month name is present.
+    """
+    folded_text = unidecode(str(text)).lower()
+    found = []
+    for month in turkish_months:
+        if unidecode(month).lower() in folded_text:
+            found.append(month)
+    return found or ["N/A"]
+
+
+def normalize_text(text):
+    """Return *text* transliterated with ``unidecode`` and lower-cased."""
+    ascii_text = unidecode(text)
+    return ascii_text.lower()
+
+
+def build_name_regex_all_combinations(person):
+    """Compile regexes for every ordering of every subset of a person's names.
+
+    Args:
+        person: dict with optional ``firstname``, ``middle_name`` and
+            ``last_name`` entries; missing, ``None`` and blank values are ignored.
+
+    Returns:
+        list: ``(compiled_pattern, weight)`` tuples, where weight is the number
+        of name parts the pattern covers. Parts within a pattern may be
+        separated by any amount of whitespace (including none). When at least
+        two parts exist, a pattern for all parts run together is added with
+        weight ``len(parts)``.
+    """
+    # The values may be None (they are copied straight from query rows), and
+    # dict.get's default only covers absent keys, so coerce before stripping.
+    raw_parts = [(person.get(key) or "").strip() for key in ("firstname", "middle_name", "last_name")]
+    parts = [unidecode(p).lower() for p in raw_parts if p]
+
+    patterns = []
+    for r in range(1, len(parts) + 1):
+        # permutations (not combinations) so every ordering is covered
+        for permuted_parts in permutations(parts, r):
+            regex_pattern = r"\b" + r"\s*".join(map(re.escape, permuted_parts)) + r"\b"
+            patterns.append((re.compile(regex_pattern, flags=re.IGNORECASE), r))
+
+    # Fully merged version like "fatihergunguclu"
+    if len(parts) >= 2:
+        merged = ''.join(parts)
+        patterns.append((re.compile(rf"\b{re.escape(merged)}\b", flags=re.IGNORECASE), len(parts)))
+
+    return patterns
+
+
+def get_person_initials(person):
+    """Return the upper-case ASCII initial of each name part the person has.
+
+    Parts are read from ``firstname``, ``middle_name`` and ``last_name`` in that
+    order. Missing, ``None``, and whitespace-only parts are skipped; a
+    whitespace-only part previously raised ``IndexError``.
+    """
+    initials = []
+    for key in ("firstname", "middle_name", "last_name"):
+        folded = unidecode((person.get(key) or "").strip())
+        if folded:
+            initials.append(folded[0].upper())
+    return initials
+
+def get_text_initials(matched_text):
+    """Return the upper-case ASCII initial of every whitespace-separated word."""
+    initials = []
+    # str.split() with no argument never yields empty or padded tokens.
+    for word in matched_text.split():
+        initials.append(unidecode(word)[0].upper())
+    return initials
+
+def find_person_regex_all_combinations(text, person):
+    """Collect every hit of the person's name patterns in the normalized text.
+
+    Returns:
+        list: one dict per regex match with keys ``matched_person`` (the
+        *person* passed in), ``matched_text`` (the stripped match) and
+        ``weight`` (the weight attached to the pattern that matched).
+    """
+    folded_text = normalize_text(text)
+    return [
+        {
+            "matched_person": person,
+            "matched_text": hit.group().strip(),
+            "weight": pattern_weight,
+        }
+        for pattern, pattern_weight in build_name_regex_all_combinations(person)
+        for hit in pattern.finditer(folded_text)
+    ]
+
+def find_all_person_matches(text):
+    """Return the people whose name patterns hit *text*, strongest match first.
+
+    A hit only counts when at least two initials of the matched text are also
+    initials of the person. For each person the highest pattern weight is kept.
+
+    Returns:
+        list: person dicts ordered by descending weight, or ``["N/A"]``.
+    """
+    best_by_person = {}
+
+    for candidate in people:
+        for hit in find_person_regex_all_combinations(text, candidate):
+            matched_person = hit["matched_person"]
+            hit_weight = hit["weight"]
+
+            person_initials = get_person_initials(matched_person)
+            shared_initials = [
+                letter
+                for letter in get_text_initials(hit["matched_text"])
+                if letter in person_initials
+            ]
+            if len(shared_initials) < 2:
+                continue
+
+            person_key = tuple(matched_person.values())
+            previous = best_by_person.get(person_key)
+            if previous is None or previous[0] < hit_weight:
+                best_by_person[person_key] = (hit_weight, matched_person)
+
+    ranked = sorted(best_by_person.values(), key=lambda entry: -entry[0])
+    return [entry[1] for entry in ranked] or ["N/A"]
+
+
+def find_all_person_matches_possibilities(text):
+    """Return every person with any name-pattern hit in *text*, strongest first.
+
+    Unlike ``find_all_person_matches`` there is no initials check: a single
+    pattern hit is enough. For each person the highest pattern weight is kept.
+
+    Returns:
+        list: person dicts ordered by descending weight, or ``["N/A"]``.
+    """
+    best_by_person = {}
+
+    for person in people:
+        for match in find_person_regex_all_combinations(text, person):
+            matched_person = match["matched_person"]
+            weight = match["weight"]
+            person_key = tuple(matched_person.values())
+            # Keep the strongest hit rather than whichever happened to come last.
+            if person_key not in best_by_person or best_by_person[person_key][0] < weight:
+                best_by_person[person_key] = (weight, matched_person)
+
+    sorted_matches = sorted(best_by_person.values(), key=lambda x: -x[0])
+    return [m[1] for m in sorted_matches] or ["N/A"]
+
+start_time = perf_counter()
+df = pd.read_csv("account_records_incoming.csv")
+
+# Clean the free-text comment once; every extractor below reads the cleaned text.
+df["process_comment"] = df["process_comment"].apply(clean_text)
+
+# "year" is the first run of four digits in the comment as a number, blanked
+# (NaN) when it lies outside start_year..current_year.
+first_four_digits = df["process_comment"].str.extract(r'(\d{4})')[0]
+numeric_year = pd.to_numeric(first_four_digits, errors='coerce')
+df["year"] = numeric_year.where(numeric_year.between(start_year, current_year))
+
+# One derived column per extractor, in this order.
+for column_name, extractor in (
+    ("years_found", extract_years),
+    ("months_found", extract_months),
+    ("fuzzy_months_found", best_month_matches),
+    ("person_name_matches", find_all_person_matches),
+    ("person_name_matches_possibilities", find_all_person_matches_possibilities),
+):
+    df[column_name] = df["process_comment"].apply(extractor)
+
+# df["year_str"] = df["year"].apply(lambda x: str(int(x)) if pd.notnull(x) else "N/A")
+# print(df[["process_comment", "year_str", "months_found"]].head(100))
+
+# for ix, row in df[["process_comment", "years_found", "months_found", "fuzzy_months_found", "person_name_matches", "person_name_matches_possibilities"]].iterrows():
+#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
+#         print(f"Row number {ix} | {row['process_comment']}")
+#         # print(f"Years found: {row['years_found']} | Months found: {row['months_found']} | Fuzzy months found: {row['fuzzy_months_found']}")
+#         print(f"Person name matches: {row['person_name_matches']} | Person name matches possibilities: {row['person_name_matches_possibilities']}")
+
+# Row subsets; ["N/A"] is the sentinel each extractor returns when it finds nothing.
+month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
+# Fuzzy month hits only count where the direct lookup found nothing, so both
+# conditions must hold (AND) - the same shape as people_found_possibilities.
+# The previous OR also counted rows in which no month was found at all.
+month_indirect_found = df[
+    df['months_found'].apply(lambda x: x == ["N/A"])
+    & df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])
+]
+years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
+people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"])]
+people_found_possibilities = df[
+    df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"])
+    & df['person_name_matches'].apply(lambda x: x == ["N/A"])
+]
+
+print("length of years where is not N/A :           ", len(years_found), '/', len(df))
+print("length of months where is not N/A :          ", len(month_direct_found), '/', len(df))
+print("length of fuzzy months where is not N/A :    ", len(month_indirect_found), '/', len(df))
+print("lenght of names found", len(people_found), '/', len(df))
+print("lenght of names found with possibilities", len(people_found_possibilities), '/', len(df))
+end_time = perf_counter()
+# start_time was taken before read_csv, so this covers loading plus all extraction.
+print(f"Time taken to read CSV: {end_time - start_time:.2f} seconds")
+# for ix, row in df[['id', 'process_comment', 'years_found', 'months_found', 'fuzzy_months_found', 'person_name_matches', 'person_name_matches_possibilities']].iterrows():
+#     print(f"id {row['id']} | {row['process_comment']} | {row['years_found']} | {row['months_found']} | Person Name Matches : {row['person_name_matches']}")
+#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
+#         print(f"id {row['id']} | Fuzzy person Matches : {row['person_name_matches_possibilities']}")
+#     if row['months_found'] == ["N/A"] and row['fuzzy_months_found'] != ["N/A"]:
+#         print(f"id {row['id']} | Fuzzy month Matches  : {row['fuzzy_months_found']}")
--- a/miner.py
+++ b/miner.py
@@ -0,0 +1,266 @@
+from base_import import get_session_factory
+from sqlalchemy import text as sqlalchemy_text
+
+# Module-level session taken from the shared scoped-session registry; the
+# statements further down in this script execute on it.
+session_factory = get_session_factory()
+session = session_factory()
+
+# LIMIT and OFFSET are passed as bound parameters instead of being formatted into the SQL
+def query_to_run(limit: int, offset: int):
+    """Build the paged query for account records that have no training rows yet.
+
+    Selects account records with ``bank_date`` after 2023-06-30 that have no
+    row in ``account_records_model_train``, ordered by ``bank_date``.
+
+    Args:
+        limit: maximum number of rows to return (coerced with ``int``).
+        offset: number of rows to skip (coerced with ``int``).
+
+    Returns:
+        A SQLAlchemy text clause with ``limit`` and ``offset`` already bound,
+        ready to pass to ``session.execute``.
+    """
+    return sqlalchemy_text("""
+        SELECT a.id, a.uu_id, a.iban, a.bank_date, a.process_comment, a.payment_result_type,
+        b.part_code, COUNT(a2.id) AS sum_a2_id
+        FROM public.account_records AS a
+        INNER JOIN public.build_parts b ON b.id = a.build_parts_id
+        INNER JOIN public.api_enum_dropdown ae ON ae.id = a.payment_result_type
+        INNER JOIN public.build_living_space bl ON bl.id = a.living_space_id
+        INNER JOIN public.people p ON p.id = bl.person_id
+        LEFT JOIN public.account_records_model_train a2 ON a2.account_records_id = a.id
+        WHERE a.bank_date::date > '2023-06-30' and a2.id is null
+        GROUP BY a.id, a.uu_id, a.iban, a.bank_date, a.process_comment, a.payment_result_type, b.part_code
+        ORDER BY a.bank_date ASC
+        LIMIT :limit OFFSET :offset;
+        """).bindparams(limit=int(limit), offset=int(offset))
+
+
+"""
+52	56b75aec-d28f-4cd4-84e9-ea222cc1d9bd	BuildTypes	APT_KZN	Apartman Kazan Dairesi
+53	a9f854d1-d01d-4f2a-af5f-1ccf34193e0f	BuildTypes	APT_GRJ	Apartman Garaj
+54	ed7371a4-0a0a-491d-b1f9-015025b6ac91	BuildTypes	APT_DP	Apartman Depo
+55	f6eb95dd-5ed0-407b-8205-4bc855199b06	BuildTypes	DAIRE	Apartman Dairesi
+56	242bbe5e-44df-4f10-9583-9d80ff93c52d	BuildTypes	APT	Apartman Binası
+57	8920eb8b-a5aa-42c3-81d0-13afca85ba1f	BuildTypes	APT_YNT	Apartman Yönetimi
+58	a7b98daf-c83e-494d-8938-d716be131b5d	BuildTypes	APT_PRK	Apartman Açık Park Alanı
+59	628188d9-b5e3-493e-9a42-afac3f5bf816	BuildTypes	APT_YSL	Apartman Yeşil Alan
+60	8b5bcca2-7702-4486-904c-d708248ccd4d	BuildTypes	APT_YOL	Apartman Ara Yol
+5	1b51381a-b5a9-485e-884e-fab07b4adf21	BuildDuesTypes	BDT-S	Service fee
+6	4619b29f-7b60-4b95-9a97-50a4e5d40f94	BuildDuesTypes	BDT-I	Information
+1	2d0127eb-899e-47c5-ad86-67a78174bf90	BuildDuesTypes	BDT-D	Bina Aidat
+2	11656423-24b7-4ed9-96e7-1563f639da53	BuildDuesTypes	BDT-A	Bina Ek Aidat
+3	c74c72f4-5e10-4d00-8016-4f9ddd50b3c4	BuildDuesTypes	BDT-R	Bina Tadilat
+4	5edeb654-b7ce-4c1f-b7e3-2c717bb1d263	BuildDuesTypes	BDT-L	Bina Yasal Harcama
+48	f14ae805-8238-438f-a522-d8ac6553f717	TimePeriod	TP-W	Weekly
+49	184c3356-6397-476d-a965-45ddf26a4ff5	TimePeriod	TP-M	Monthly
+50	ba36110f-7afe-4c41-bcad-f80ce71f626e	TimePeriod	TP-Q	Quarterly
+51	89ff94c6-126e-45c2-9bc7-6d1007d02528	TimePeriod	TP-Y	Yearly
+32	3cf533a4-3947-4563-9a43-16ea2bab1119	PerComType	1	Person
+
+"""
+
+class Categories:
+    """Symbolic category names and their ``(id, uu_id)`` pairs.
+
+    The pairs follow the reference listing kept in the string literal above
+    this class. In that listing id 5 is BDT-S (service fee) and id 6 is BDT-I
+    (information); BDTI previously pointed at the BDT-S row, so BDTS was added
+    and BDTI corrected.
+    """
+
+    APTKZN = "APTKZN"
+    APTGRJ = "APTGRJ"
+    APTDP = "APTDP"
+    DAIRE = "DAIRE"
+    APT = "APT"
+    APTYNT = "APTYNT"
+    APTPRK = "APTPRK"
+    APTYSL = "APTYSL"
+    APTYOL = "APTYOL"
+    BDTS = "BDTS"
+    BDTI = "BDTI"
+    BDTD = "BDTD"
+    BDTA = "BDTA"
+    BDTR = "BDTR"
+    BDTL = "BDTL"
+    TPW = "TPW"
+    TPM = "TPM"
+    TPQ = "TPQ"
+    TPY = "TPY"
+    PERSON = "PERSON"
+
+    # name -> (id, uu_id); built once with the class instead of on every call.
+    _CATEGORY_IDS = {
+        "APTKZN": (52, "56b75aec-d28f-4cd4-84e9-ea222cc1d9bd"),
+        "APTGRJ": (53, "a9f854d1-d01d-4f2a-af5f-1ccf34193e0f"),
+        "APTDP": (54, "ed7371a4-0a0a-491d-b1f9-015025b6ac91"),
+        "DAIRE": (55, "f6eb95dd-5ed0-407b-8205-4bc855199b06"),
+        "APT": (56, "242bbe5e-44df-4f10-9583-9d80ff93c52d"),
+        "APTYNT": (57, "8920eb8b-a5aa-42c3-81d0-13afca85ba1f"),
+        "APTPRK": (58, "a7b98daf-c83e-494d-8938-d716be131b5d"),
+        "APTYSL": (59, "628188d9-b5e3-493e-9a42-afac3f5bf816"),
+        "APTYOL": (60, "8b5bcca2-7702-4486-904c-d708248ccd4d"),
+        "BDTS": (5, "1b51381a-b5a9-485e-884e-fab07b4adf21"),
+        "BDTI": (6, "4619b29f-7b60-4b95-9a97-50a4e5d40f94"),
+        "BDTD": (1, "2d0127eb-899e-47c5-ad86-67a78174bf90"),
+        "BDTA": (2, "11656423-24b7-4ed9-96e7-1563f639da53"),
+        "BDTR": (3, "c74c72f4-5e10-4d00-8016-4f9ddd50b3c4"),
+        "BDTL": (4, "5edeb654-b7ce-4c1f-b7e3-2c717bb1d263"),
+        "TPW": (48, "f14ae805-8238-438f-a522-d8ac6553f717"),
+        "TPM": (49, "184c3356-6397-476d-a965-45ddf26a4ff5"),
+        "TPQ": (50, "ba36110f-7afe-4c41-bcad-f80ce71f626e"),
+        "TPY": (51, "89ff94c6-126e-45c2-9bc7-6d1007d02528"),
+        "PERSON": (32, "3cf533a4-3947-4563-9a43-16ea2bab1119"),
+    }
+
+    @classmethod
+    def get_category_id(cls, category_name):
+        """Return the ``(id, uu_id)`` tuple for *category_name*.
+
+        Raises:
+            ValueError: if *category_name* is not a known category.
+        """
+        try:
+            return cls._CATEGORY_IDS[category_name]
+        except KeyError:
+            raise ValueError(f"Invalid category name: {category_name}") from None
+    
+
+def get_model_train_query():
+    """Return the INSERT statement for ``public.account_records_model_train``.
+
+    The statement uses named placeholders (``:account_records_id``,
+    ``:account_records_uu_id``, ``:search_text``, ``:start_index``,
+    ``:end_index``, ``:category_id``, ``:category_uu_id``); values are supplied
+    when the statement is executed.
+    """
+    insert_statement = """
+        INSERT INTO public.account_records_model_train (
+            account_records_id,
+            account_records_uu_id,
+            search_text,
+            start_index,
+            end_index,
+            category_id,
+            category_uu_id
+        )
+        VALUES (
+            :account_records_id,
+            :account_records_uu_id,
+            :search_text,
+            :start_index,
+            :end_index,
+            :category_id,
+            :category_uu_id
+        );
+    """
+    return sqlalchemy_text(insert_statement)
+
+
+def find_indices(search_text: str, target_text: str, offset: int = 1):
+    """Locate the *offset*-th occurrence of *search_text* in *target_text*.
+
+    Occurrences may overlap: the search for the next one resumes one character
+    after the start of the previous one.
+
+    Args:
+        search_text (str): The text to search for
+        target_text (str): The text to search within
+        offset (int, optional): Which occurrence to find (1 for first, 2 for
+            second, etc). Values below 1 are treated as 1. Defaults to 1.
+
+    Returns:
+        tuple: ``(start_index, end_index)`` with an exclusive end, so that
+        ``target_text[start_index:end_index] == search_text``; ``(-1, -1)``
+        when there are fewer than *offset* occurrences.
+    """
+    start_index = -1
+    for _ in range(max(offset, 1)):
+        start_index = target_text.find(search_text, start_index + 1)
+        if start_index == -1:
+            return -1, -1
+    return start_index, start_index + len(search_text)
+
+
+class AccountRecord:
+    """One row of the account-records query, available as attributes or as a dict."""
+
+    # Attribute names in constructor-argument order; also the key order of to_dict().
+    _FIELDS = (
+        "id",
+        "uu_id",
+        "iban",
+        "bank_date",
+        "process_comment",
+        "payment_result_type",
+        "part_code",
+        "sum_a2_id",
+    )
+
+    def __init__(self, id, uu_id, iban, bank_date, process_comment, payment_result_type, part_code, sum_a2_id):
+        # uu_id is stored as its string form; everything else is stored as given.
+        values = (id, str(uu_id), iban, bank_date, process_comment, payment_result_type, part_code, sum_a2_id)
+        for name, value in zip(self._FIELDS, values):
+            setattr(self, name, value)
+
+    def to_dict(self):
+        """Return the record's fields as a plain dict."""
+        return {name: getattr(self, name) for name in self._FIELDS}
+
+# Run the paged query and keep the dict form of the last row it returns
+# (it stays an empty dict when the query returns no rows).
+results = session.execute(query_to_run(limit=1, offset=0))
+
+account_record_dict = {}
+# Example row:
+# (791, UUID('9d276cc8-289f-45c1-9805-44464af5d7bf'), 'TR400006400000142450093333',
+#  datetime.datetime(2023, 7, 1, 12, 22, 27, tzinfo=datetime.timezone.utc),
+#  '2 NOLU DAİRE TEMMUZ Ç3 AİDAT*SONGÜL VAR*Hİ7748686973', 1, 'DAIRE_2', 4)
+for result in results:
+    account_record = AccountRecord(*result)
+    account_record_dict = account_record.to_dict()
+
+
+"""
+"APTKZN": (52, "56b75aec-d28f-4cd4-84e9-ea222cc1d9bd"),
+"APTGRJ": (53, "a9f854d1-d01d-4f2a-af5f-1ccf34193e0f"),
+"APTDP": (54, "ed7371a4-0a0a-491d-b1f9-015025b6ac91"),
+"DAIRE": (55, "f6eb95dd-5ed0-407b-8205-4bc855199b06"),
+"APT": (56, "242bbe5e-44df-4f10-9583-9d80ff93c52d"),
+"APTYNT": (57, "8920eb8b-a5aa-42c3-81d0-13afca85ba1f"),
+"APTPRK": (58, "a7b98daf-c83e-494d-8938-d716be131b5d"),
+"APTYSL": (59, "628188d9-b5e3-493e-9a42-afac3f5bf816"),
+"APTYOL": (60, "8b5bcca2-7702-4486-904c-d708248ccd4d"),
+"BDTI": (5, "1b51381a-b5a9-485e-884e-fab07b4adf21"),
+"BDTD": (6, "4619b29f-7b60-4b95-9a97-50a4e5d40f94"),
+"BDTA": (2, "11656423-24b7-4ed9-96e7-1563f639da53"),
+"BDTR": (3, "c74c72f4-5e10-4d00-8016-4f9ddd50b3c4"),
+"BDTL": (4, "5edeb654-b7ce-4c1f-b7e3-2c717bb1d263"),
+"TPW": (48, "f14ae805-8238-438f-a522-d8ac6553f717"),
+"TPM": (49, "184c3356-6397-476d-a965-45ddf26a4ff5"),
+"TPQ": (50, "ba36110f-7afe-4c41-bcad-f80ce71f626e"),
+"TPY": (51, "89ff94c6-126e-45c2-9bc7-6d1007d02528"),
+"PCT": (32, "3cf533a4-3947-4563-9a43-16ea2bab1119"),
+"""
+
+# Show the record loaded above.
+for shown_key in ('id', 'uu_id', 'process_comment'):
+    print(account_record_dict[shown_key])
+
+
+# Hand-picked sample: mark the person's name inside one process comment.
+account_records_id = 219
+account_records_uuid = "5d301273-806c-47d6-aeeb-e056dc119494"
+sample_text = "GÜNEŞ APARTMANI AİDAT EYLÜL*HASAN CİHAN ŞENKÜÇÜK*Hİ9021822604"
+search_text = "HASAN CİHAN ŞENKÜÇÜK"
+start_index, end_index = find_indices(search_text, sample_text, offset=1)
+print("start_index", start_index)
+print("end_index", end_index)
+category_id, category_uuid = Categories.get_category_id(Categories.PERSON)
+
+write_dict = {
+    "account_records_id": account_records_id,
+    "account_records_uu_id": account_records_uuid,
+    "search_text": search_text,
+    "start_index": start_index,
+    "end_index": end_index,
+    "category_id": category_id,
+    "category_uu_id": category_uuid,
+}
+print('write_dict', write_dict)
+
+# The bound parameters are the same mapping, one entry per placeholder.
+params = dict(write_dict)
+
+# Parameterized INSERT template
+query_template = get_model_train_query()
+
+# Print parameters for debugging
+print("Parameters:", params)
+
+# Execute the insert and commit it
+session.execute(query_template, params)
+session.commit()
--- a/1
+++ b/1
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+sqlalchemy-mixins>=2.0.5
+psycopg2-binary>=2.9.10
+arrow>=1.3.0
+pandas>=2.2.2
+numpy>=1.26.4
--- a/toCsvFrame.py
+++ b/toCsvFrame.py
@@ -0,0 +1,9 @@
+from base_import import get_session_factory, engine
+from sqlalchemy import text as sqlalchemy_text
+import pandas as pd
+
+# session_factory = get_session_factory()
+# session = session_factory()
+
+# Dump every account record with a positive currency_value to CSV via the shared engine.
+query = sqlalchemy_text("SELECT * FROM public.account_records where currency_value > 0")
+incoming_records = pd.read_sql(query, engine)
+incoming_records.to_csv("account_records_incoming.csv", index=False)
--- a/withoutPandas.py
+++ b/withoutPandas.py
@@ -0,0 +1,553 @@
+import re
+import arrow
+from unidecode import unidecode
+from difflib import SequenceMatcher
+from itertools import permutations
+from time import perf_counter
+from base_import import get_session_factory
+from sqlalchemy import text as sqlalchemy_text
+
+
+# Session used for the two lookup queries below (opened at import time).
+session_factory = get_session_factory()
+session = session_factory()
+
+
+# Full Turkish month names, upper case, in calendar order.
+turkish_months = [
+    "OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN",
+    "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK",
+]
+# Abbreviation -> full month name.
+turkish_months_abbr = {
+    "OCA": "OCAK",
+    "SUB": "ŞUBAT",
+    "ŞUB": "ŞUBAT",
+    "MAR": "MART",
+    "NIS": "NİSAN",
+    "MAY": "MAYIS",
+    "HAZ": "HAZİRAN",
+    "HZR": "HAZİRAN",
+    "TEM": "TEMMUZ",
+    "AGU": "AĞUSTOS",
+    "AGT": "AĞUSTOS",
+    "EYL": "EYLÜL",
+    "EKI": "EKİM",
+    "KAS": "KASIM",
+    "ARA": "ARALIK",
+}
+# Inclusive range of years treated as valid.
+start_year = 1950
+current_year = arrow.now().year
+
+# One row per person (DISTINCT ON p.id) among the rows joined to build 1.
+people_query = sqlalchemy_text("""
+    SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id 
+    FROM public.people as p
+    INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
+    INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
+    INNER JOIN public.build as b ON b.id = bp.build_id
+    WHERE b.id = 1
+    ORDER BY p.id
+""")
+people_raw = session.execute(people_query).all()
+
+# Keep only the first row for each distinct "first middle surname birthname" string.
+remove_duplicate = []
+clean_people_list = []
+for person in people_raw:
+    merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
+    if merged_name in remove_duplicate:
+        continue
+    remove_duplicate.append(merged_name)
+    clean_people_list.append(person)
+
+people = [
+    {"firstname": first, "middle_name": middle, "surname": last, "birthname": birth, 'id': living_space_id}
+    for first, middle, last, birth, living_space_id in clean_people_list
+]
+
+# Account records with a positive currency_value.
+query_account_records = sqlalchemy_text("""
+    SELECT a.id, a.iban, a.bank_date, a.process_comment FROM public.account_records as a where currency_value > 0
+    
+""")    # and bank_date::date >= '2020-01-01'
+account_records = [
+    {"id": record_id, "iban": iban, "bank_date": bank_date, "process_comment": comment}
+    for record_id, iban, bank_date, comment in session.execute(query_account_records).all()
+]
+
+
+def clean_text(text):
+    """Strip long digit runs and punctuation from *text* and squeeze whitespace.
+
+    Steps, in order: stringify; replace every run of eight or more digits with
+    a space; replace each of ``/ _ - + * , .`` with a space; collapse
+    whitespace runs to one space; strip both ends.
+    """
+    cleaned = re.sub(r'\d{8,}', ' ', str(text))
+    # text = re.sub(r'\b[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*\b|\b[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*\b', ' ', text)
+    for punctuation in ("/", "_", "-", "+", "*", ",", "."):
+        cleaned = cleaned.replace(punctuation, " ")
+    return re.sub(r'\s+', ' ', cleaned).strip()
+
+
+def normalize_text(text):
+    """Lower-case the Turkish capitals by hand, then return the ``unidecode``
+    transliteration of the result in lower case."""
+    turkish_capitals = (
+        ('İ', 'i'),
+        ('I', 'ı'),
+        ('Ş', 'ş'),
+        ('Ğ', 'ğ'),
+        ('Ü', 'ü'),
+        ('Ö', 'ö'),
+        ('Ç', 'ç'),
+    )
+    for capital, small in turkish_capitals:
+        text = text.replace(capital, small)
+    return unidecode(text).lower()
+
+
+def get_person_initials(person):
+    """Return the upper-case ASCII initial of each name part the person has.
+
+    Parts are read from ``firstname``, ``middle_name``, ``surname`` and
+    ``birthname`` in that order. Missing, ``None``, and whitespace-only parts
+    are skipped; a whitespace-only part previously raised ``IndexError``.
+    """
+    initials = []
+    for key in ("firstname", "middle_name", "surname", "birthname"):
+        folded = unidecode((person.get(key) or "").strip())
+        if folded:
+            initials.append(folded[0].upper())
+    return initials
+
+
+def get_text_initials(matched_text):
+    """Return the upper-case ASCII initial of every whitespace-separated word."""
+    initials = []
+    # str.split() with no argument never yields empty or padded tokens.
+    for word in matched_text.split():
+        initials.append(unidecode(word)[0].upper())
+    return initials
+
+
+def _name_part(value):
+    """Return the original, normalized and initial forms of one name part."""
+    norm = normalize_text(value)
+    return {'orig': value, 'norm': norm, 'init': norm[0]}
+
+
+def _spellings(*part_names):
+    """Return the 'orig' and 'norm' format rows for the given part order."""
+    return [[(kind, name) for name in part_names] for kind in ('orig', 'norm')]
+
+
+def _create_name_patterns(parts, formats, separators=None):
+    """Build word-boundary-anchored regex sources for each (format, separator) pair.
+
+    A format is a list of ``(part_type, part_name)`` pairs; pairs whose part is
+    absent from *parts* are left out. Name parts and separators are both
+    regex-escaped, so a "." separator matches a literal dot only.
+    """
+    if separators is None:
+        separators = [""]
+
+    patterns = []
+    for fmt in formats:
+        pattern_parts = [
+            re.escape(parts[part_name][part_type])
+            for part_type, part_name in fmt
+            if part_name in parts and part_type in parts[part_name]
+        ]
+        if not pattern_parts:
+            continue
+        for sep in separators:
+            patterns.append(r"\b" + re.escape(sep).join(pattern_parts) + r"\b")
+    return patterns
+
+
+def generate_dictonary_of_patterns():
+    """Compile the name-matching regexes for every entry of ``people``.
+
+    People without both a first name and a surname are skipped. For the rest,
+    patterns cover the full name, first+middle and middle+surname (when a
+    middle name exists), first name with birth name (when it differs from the
+    surname), and initials, each with the separators listed below.
+
+    TODO (from the original note): drop middle_name entirely and match
+    "firstname + some word + surname" instead.
+
+    Returns:
+        dict: the person's ``id`` entry mapped to a list of compiled,
+        case-insensitive patterns (in sorted pattern-source order).
+    """
+    patterns_dict = {}
+
+    for person in people:
+        person_id = person.get('id')
+        firstname, middle_name, surname, birthname = (
+            (person.get(key) or "").strip()
+            for key in ("firstname", "middle_name", "surname", "birthname")
+        )
+
+        if not firstname or not surname:
+            continue
+
+        name_parts = {
+            'firstname': _name_part(firstname),
+            'surname': _name_part(surname),
+        }
+        if middle_name:
+            name_parts['middle_name'] = _name_part(middle_name)
+        # A birth name only adds information when it differs from the surname.
+        if birthname and normalize_text(birthname) != name_parts['surname']['norm']:
+            name_parts['birthname'] = _name_part(birthname)
+
+        has_middle_name = 'middle_name' in name_parts
+        person_patterns = set()
+
+        # NOTE(review): when a middle name exists, the plain firstname/surname
+        # orderings are replaced (not extended) by the three-part form, as in
+        # the original code - confirm that this is intended.
+        if has_middle_name:
+            name_formats = _spellings('firstname', 'middle_name', 'surname')
+        else:
+            name_formats = _spellings('firstname', 'surname') + _spellings('surname', 'firstname')
+        person_patterns.update(_create_name_patterns(name_parts, name_formats, [" ", ""]))
+
+        if has_middle_name:
+            middle_name_formats = _spellings('firstname', 'middle_name') + _spellings('middle_name', 'surname')
+            person_patterns.update(_create_name_patterns(name_parts, middle_name_formats, [" ", ""]))
+
+        if 'birthname' in name_parts:
+            birthname_formats = _spellings('firstname', 'birthname') + _spellings('birthname', 'firstname')
+            person_patterns.update(_create_name_patterns(name_parts, birthname_formats, [" ", ""]))
+
+        # Initials; the first row collapses to first+surname when there is no
+        # middle name, and already covers the three-initial case when there is.
+        initial_formats = [
+            [('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')],
+            [('init', 'firstname'), ('init', 'surname')],
+        ]
+        person_patterns.update(_create_name_patterns(name_parts, initial_formats, ["", ".", " ", ". "]))
+
+        # Sorted so the compiled list has the same order on every run.
+        patterns_dict[person_id] = [
+            re.compile(pattern, re.IGNORECASE) for pattern in sorted(person_patterns)
+        ]
+
+    return patterns_dict
+
+
+def extract_person_name_with_regex(found_dict, process_comment, patterns_dict):
+    cleaned_text = process_comment  
+    all_matches = []
+
+    for person_id, patterns in patterns_dict.items():
+        person = next((p for p in people if p.get('id') == person_id), None)
+        if not person:
+            continue
+            
+        firstname_norm = normalize_text(person.get("firstname", "").strip()) if person.get("firstname") else ""
+        middle_name_norm = normalize_text(person.get("middle_name", "").strip()) if person.get("middle_name") else ""
+        surname_norm = normalize_text(person.get("surname", "").strip()) if person.get("surname") else ""
+        birthname_norm = normalize_text(person.get("birthname", "").strip()) if person.get("birthname") else ""
+        
+        text_norm = normalize_text(process_comment)
+        person_matches = []
+        
+        for pattern in patterns:
+            for match in pattern.finditer(text_norm):
+                start, end = match.span()
+                matched_text = process_comment[start:end]  
+                matched_text_norm = normalize_text(matched_text)
+                
+                is_valid_match = False
+                
+                # Strict validation: require both firstname AND surname/birthname
+                # No single-word matches allowed
+                if len(matched_text_norm.split()) <= 1:
+                    # Single word matches are not allowed
+                    is_valid_match = False
+                else:
+                    # For multi-word matches, require firstname AND (surname OR birthname)
+                    has_firstname = firstname_norm and firstname_norm in matched_text_norm
+                    has_surname = surname_norm and surname_norm in matched_text_norm
+                    has_birthname = birthname_norm and birthname_norm in matched_text_norm
+                    
+                    # Both firstname and surname/birthname must be present
+                    if (has_firstname and has_surname) or (has_firstname and has_birthname):
+                        is_valid_match = True
+                
+                if is_valid_match:
+                    person_matches.append({
+                        'matched_text': matched_text,
+                        'start': start,
+                        'end': end
+                    })
+        
+        if person_matches:
+            person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True)
+            
+            non_overlapping_matches = []
+            for match in person_matches:
+                overlaps = False
+                for existing_match in non_overlapping_matches:
+                    if (match['start'] < existing_match['end'] and match['end'] > existing_match['start']):
+                        overlaps = True
+                        break
+                
+                if not overlaps:
+                    non_overlapping_matches.append(match)
+            
+            if non_overlapping_matches:
+                found_dict["name_match"] = person
+                all_matches.extend([(match, person) for match in non_overlapping_matches])
+    
+    if all_matches:
+        all_matches.sort(key=lambda x: x[0]['start'], reverse=True)
+        
+        for match, person in all_matches:
+            matched_text = match['matched_text']
+            
+            matched_words = matched_text.split()
+            
+            for word in matched_words:
+                word_norm = normalize_text(word).strip()
+                
+                if not word_norm:
+                    continue
+                
+                text_norm = normalize_text(cleaned_text)
+                for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE):
+                    start, end = word_match.span()
+                    cleaned_text = cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:]
+        
+        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+    
+    return found_dict, cleaned_text
+
+
+def extract_build_parts_info(found_dict, process_comment):
+
+    """
+    Regex of parts such as :
+    2 nolu daire
+    9 NUMARALI DAI
+    daire 3
+    3 nolu dairenin
+    11nolu daire
+    Daire No 12
+    2NOLU DAIRE
+    12 No lu daire
+    D:10
+    NO:11
+    NO :3
+    """
+    # Initialize apartment number variable
+    apartment_number = None
+    cleaned_text = process_comment
+    
+    def clean_text_apartment_number(text, match):
+        clean_text = text.replace(match.group(0), '').strip()
+        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
+        return clean_text
+
+    # Pattern 1: X nolu daire (with space)
+    pattern1 = re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE)
+    match = pattern1.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    # Pattern 4: X nolu dairenin
+    pattern4 = re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE)
+    match = pattern4.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    # Pattern 5: XNolu daire (without space)
+    pattern5 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE)
+    match = pattern5.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+
+    # Pattern 7: XNOLU DAIRE (all caps, no space)
+    pattern7 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE)
+    match = pattern7.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    # Pattern 8: X No lu daire
+    pattern8 = re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE)
+    match = pattern8.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+
+    # Pattern 6: Daire No X
+    pattern6 = re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE)
+    match = pattern6.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+
+    # Pattern 2: X NUMARALI DAI
+    pattern2 = re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE)
+    match = pattern2.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    # Pattern 3: daire X
+    pattern3 = re.compile(r'daire\s*(\d+)', re.IGNORECASE)
+    match = pattern3.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    # Pattern 9: D:X
+    pattern9 = re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE)
+    match = pattern9.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    # Pattern 10: NO:X or NO :X
+    pattern10 = re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE)
+    match = pattern10.search(cleaned_text)
+    if match:
+        apartment_number = match.group(1)
+        found_dict['apartment_number'] = apartment_number
+        return found_dict, clean_text_apartment_number(cleaned_text, match)
+    
+    return found_dict, cleaned_text
+
+
+def extract_months(found_dict, process_comment):
+    """
+    Extract Turkish month names and abbreviations from the process comment
+    """
+    original_text = process_comment
+    # Updated dictionary with normalized keys for better matching
+    month_to_number_dict = {
+        "ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6,
+        "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11, "aralık": 12,
+        # Add normalized versions without Turkish characters
+        "ocak": 1, "subat": 2, "mart": 3, "nisan": 4, "mayis": 5, "haziran": 6,
+        "temmuz": 7, "agustos": 8, "eylul": 9, "ekim": 10, "kasim": 11, "aralik": 12
+    }
+    
+    def clean_text_month(text, match):
+        clean_text = text.replace(match.group(0), '').strip()
+        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
+        return clean_text
+    
+    def normalize_turkish(text):
+        """Properly normalize Turkish text for case-insensitive comparison"""
+        text = text.lower()
+        text = text.replace('i̇', 'i')  # Handle dotted i properly
+        text = text.replace('ı', 'i')   # Convert dotless i to regular i for matching
+        text = unidecode(text)          # Remove other diacritics
+        return text
+    
+    if 'months' not in found_dict:
+        found_dict['months'] = []
+    
+    months_found, working_text = False, original_text
+    
+    for month in turkish_months:
+        pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
+        for match in pattern.finditer(original_text):
+            matched_text = match.group(0)
+            
+            normalized_month = normalize_turkish(month)
+            month_number = None
+            
+            if month.lower() in month_to_number_dict:
+                month_number = month_to_number_dict[month.lower()]
+            elif normalized_month in month_to_number_dict:
+                month_number = month_to_number_dict[normalized_month]
+            
+            month_info = {'name': month, 'number': month_number}
+            found_dict['months'].append(month_info)
+            months_found = True
+            working_text = working_text.replace(matched_text, '', 1)
+    
+    for abbr, full_month in turkish_months_abbr.items():
+        pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
+        
+        for match in pattern.finditer(working_text):
+            matched_text = match.group(0)
+            normalized_month = normalize_turkish(full_month)
+            month_number = None
+            
+            if full_month.lower() in month_to_number_dict:
+                month_number = month_to_number_dict[full_month.lower()]
+            elif normalized_month in month_to_number_dict:
+                month_number = month_to_number_dict[normalized_month]
+            
+            month_info = {'name': full_month, 'number': month_number}
+            found_dict['months'].append(month_info)
+            months_found = True
+            working_text = working_text.replace(matched_text, '', 1)
+
+    return found_dict, working_text
+
+
+def extract_year(found_dict, process_comment):
+    """
+    Extract years from the process comment
+    """
+    original_text = process_comment
+    
+    if 'years' not in found_dict:
+        found_dict['years'] = []
+    
+    working_text = original_text
+    
+    for year in range(start_year, current_year + 1):
+        pattern = re.compile(r'\b' + str(year) + r'\b', re.IGNORECASE)
+        for match in pattern.finditer(original_text):
+            matched_text = match.group(0)
+            if str(matched_text).isdigit():
+                found_dict['years'].append(int(matched_text))
+                working_text = working_text.replace(matched_text, '', 1)
+
+    return found_dict, working_text
+
+
+def extract_payment_type(found_dict, process_comment):
+    """
+    Extract payment type from the process comment
+    aidat
+    AİD
+    aidatı
+    TADİLAT
+    YAKIT
+    yakıt
+    yakit
+    """
+    original_text = process_comment
+    working_text = original_text
+    
+    if 'payment_types' not in found_dict:
+        found_dict['payment_types'] = []
+    
+    payment_keywords = {
+        'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'],
+        'tadilat': ['tadilat', 'tadİlat', 'tadilatı'],
+        'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti']
+    }
+    
+    for payment_type, keywords in payment_keywords.items():
+        for keyword in keywords:
+            pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
+            for match in pattern.finditer(original_text):
+                matched_text = match.group(0)
+                if payment_type not in found_dict['payment_types']:
+                    found_dict['payment_types'].append(payment_type)
+                working_text = working_text.replace(matched_text, '', 1)
+    
+    return found_dict, working_text
+
+
+def main():
+
+    list_of_regex_patterns = generate_dictonary_of_patterns()
+    dicts_found = dict()
+    dicts_not_found = dict()
+    for account_record in account_records:
+        account_record_id = str(account_record["id"])
+        found_dict = {}
+        process_comment_iteration = clean_text(text=account_record["process_comment"])
+        found_dict, cleaned_process_comment = extract_person_name_with_regex(
+            found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=list_of_regex_patterns
+        )
+
+        found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, process_comment=cleaned_process_comment)
+        found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment)
+        found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment)
+        found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment)
+        if found_dict:
+            dicts_found[process_comment_iteration] = found_dict
+        else:
+            dicts_not_found[process_comment_iteration] = account_record_id
+
+    print("\n===== SUMMARY =====")
+    print(f"extracted data total            : {len(dicts_found)}")
+    print(f"not extracted data total        : {len(account_records) - len(dicts_found)}")
+    print(f"Total account records processed : {len(account_records)}")
+
+
+# Run the extraction pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()