updated table frontend
commit 917396ec15
@@ -0,0 +1,2 @@
.venv
__pycache__
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,61 @@
from contextlib import contextmanager
from functools import lru_cache
from typing import Generator

from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker, scoped_session, Session

# Configure the database engine with connection pooling
engine = create_engine(
    "postgresql+psycopg2://postgres:password@10.10.2.14:5432/postgres",
    pool_pre_ping=True,   # Verify connections before handing them out
    pool_size=10,         # Reduced from 20 to better match the available CPU cores
    max_overflow=5,       # Reduced from 10 to cap the total connection count
    pool_recycle=600,     # Recycle connections after 10 minutes
    pool_timeout=30,      # Wait up to 30 seconds for a free connection
    echo=False,           # Keep False in production to avoid noisy SQL logging
)

Base = declarative_base()


# Create a cached, thread-safe session factory
@lru_cache()
def get_session_factory() -> scoped_session:
    """Create a thread-safe session factory."""
    session_local = sessionmaker(
        bind=engine,
        autocommit=False,
        autoflush=False,
        expire_on_commit=True,  # Expire objects after commit so they reload fresh state
    )
    return scoped_session(session_local)


@contextmanager
def get_db() -> Generator[Session, None, None]:
    """Get a database session with proper connection management.

    This context manager ensures:
    - proper connection pooling
    - session cleanup
    - connections are returned to the pool
    - thread safety

    Yields:
        Session: SQLAlchemy session object
    """
    session_factory = get_session_factory()
    session = session_factory()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
        session_factory.remove()  # Remove the session from the scoped registry
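

# A minimal usage sketch of get_db; assumes this module is importable as
# base_import (as the scripts below do) and that public.people exists.
if __name__ == "__main__":
    from sqlalchemy import text

    with get_db() as session:
        # Illustrative query only; swap in any table that exists.
        people_count = session.execute(text("SELECT COUNT(*) FROM public.people")).scalar()
        print(f"people rows: {people_count}")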
@@ -0,0 +1,225 @@
import re
import pandas as pd
from unidecode import unidecode
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text

session_factory = get_session_factory()
session = session_factory()

turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
turkish_months_abbr = {
    "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
    "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
start_year = 1950
current_year = pd.Timestamp.now().year

# People living in build id 1, joined through their living spaces
people = session.execute(
    sqlalchemy_text("""
        SELECT p.firstname, p.middle_name, p.surname, bl.id
        FROM public.people AS p
        INNER JOIN public.build_living_space AS bl ON bl.person_id = p.id
        INNER JOIN public.build_parts AS bp ON bp.id = bl.build_parts_id
        INNER JOIN public.build AS b ON b.id = bp.build_id
        WHERE b.id = 1
    """)
).all()
people = [{"firstname": p[0], "middle_name": p[1], "last_name": p[2], 'id': p[3]} for p in people]


def clean_text(text):
    # Convert to string just in case
    text = str(text)
    # Collapse extra spaces and tabs by splitting and joining
    text = " ".join(text.split())
    # Replace separator characters (hyphen, asterisk, comma) with spaces
    text = text.replace("-", " ").replace("*", " ").replace(",", " ")
    return text


def is_valid_char_match(word, month):
    """Ensure all letters in the word exist in the target month name."""
    return set(word).issubset(set(month))


def best_month_matches(text, threshold=0.7):
    matches = []
    words = clean_text(text).split()

    for word in words:
        # First check the abbreviation dictionary for an exact match
        if word in turkish_months_abbr:
            full_month = turkish_months_abbr[word]
            matches.append((full_month.capitalize(), word, 1.0))
            continue

        # Otherwise fuzzy-match against the full month names
        for month in turkish_months:
            month_clean = unidecode(month.upper())
            ratio = SequenceMatcher(None, word, month_clean).ratio()
            if ratio >= threshold and is_valid_char_match(word, month_clean):
                matches.append((month.capitalize(), word, round(ratio, 2)))
                break

    return matches if matches else ["N/A"]
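

# Quick illustration of the fuzzy matcher on a made-up comment ("TEMMUZ" is
# an exact full-month hit, "EYL" an abbreviation hit):
# best_month_matches("TEMMUZ AIDAT EYL")
# -> [('Temmuz', 'TEMMUZ', 1.0), ('Eylül', 'EYL', 1.0)]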


def print_out_results(data_frame, df):
    for ix, process_comment in enumerate(data_frame):
        print(f'Row number {ix} : {process_comment}')
    print(len(data_frame), '/', len(df))


def extract_years(text):
    matches = re.findall(r'\b(19\d{2}|20\d{2})\b', text)
    valid_years = [year for year in matches if start_year <= int(year) <= current_year]
    return valid_years if valid_years else ["N/A"]


def extract_months(text):
    # Normalize text and months by converting to ASCII lowercase
    text_norm = unidecode(str(text)).lower()
    months_norm = [unidecode(m).lower() for m in turkish_months]
    found = [turkish_months[i] for i, m in enumerate(months_norm) if m in text_norm]
    return found if found else ["N/A"]


def normalize_text(text):
    return unidecode(text).lower()


def build_name_regex_all_combinations(person):
    # Guard with `or ""` because middle_name can come back as NULL/None
    firstname = (person.get("firstname") or "").strip()
    middle_name = (person.get("middle_name") or "").strip()
    last_name = (person.get("last_name") or "").strip()

    parts = [unidecode(p).lower() for p in [firstname, middle_name, last_name] if p]

    patterns = []
    for r in range(1, len(parts) + 1):
        # Use permutations instead of combinations to cover all orderings
        for permuted_parts in permutations(parts, r):
            regex_pattern = r"\b" + r"\s*".join(map(re.escape, permuted_parts)) + r"\b"
            patterns.append((re.compile(regex_pattern, flags=re.IGNORECASE), r))

    # Add a fully merged version such as "fatihergunguclu"
    if len(parts) >= 2:
        merged = ''.join(parts)
        patterns.append((re.compile(rf"\b{re.escape(merged)}\b", flags=re.IGNORECASE), len(parts)))

    return patterns


def get_person_initials(person):
    parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("last_name", "")]
    return [unidecode(p.strip())[0].upper() for p in parts if p]


def get_text_initials(matched_text):
    return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]


def find_person_regex_all_combinations(text, person):
    text_norm = normalize_text(text)
    scored_matches = []

    for pattern, weight in build_name_regex_all_combinations(person):
        for match in pattern.finditer(text_norm):
            scored_matches.append({
                "matched_person": person,
                "matched_text": match.group().strip(),
                "weight": weight,
            })

    return scored_matches


def find_all_person_matches(text):
    all_valid_matches = {}

    for person in people:
        for match in find_person_regex_all_combinations(text, person):
            matched_person = match["matched_person"]
            matched_text = match["matched_text"]
            weight = match["weight"]

            person_initials = get_person_initials(matched_person)
            found_text_letters = get_text_initials(matched_text)

            # Require at least two initials of the matched text to belong to the person
            match_count = sum(1 for c in found_text_letters if c in person_initials)

            if match_count >= 2:
                person_key = tuple(matched_person.values())
                if person_key not in all_valid_matches or all_valid_matches[person_key][0] < weight:
                    all_valid_matches[person_key] = (weight, matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]
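

# Illustrative sketch of the matcher on a hypothetical person record (the
# name and id below are invented, not taken from the database):
# _p = {"firstname": "Hasan", "middle_name": "Cihan", "last_name": "Kaya", "id": 0}
# find_person_regex_all_combinations("AIDAT HASAN CIHAN KAYA", _p)
# -> one match per permutation found in the text; the full three-part
#    permutation carries weight 3 and wins in find_all_person_matches.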


def find_all_person_matches_possibilities(text):
    # Looser variant: keep every regex hit without the initials check
    all_valid_matches = {}

    for person in people:
        for match in find_person_regex_all_combinations(text, person):
            matched_person = match["matched_person"]
            weight = match["weight"]
            person_key = tuple(matched_person.values())
            all_valid_matches[person_key] = (weight, matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]


start_time = perf_counter()
df = pd.read_csv("account_records_incoming.csv")

df["process_comment"] = df["process_comment"].apply(clean_text)
df["year"] = df["process_comment"].str.extract(r'(\d{4})')
df["year"] = pd.to_numeric(df["year"], errors='coerce')

df["year"] = df["year"].where(df["year"].between(start_year, current_year))
df["years_found"] = df["process_comment"].apply(extract_years)
df["months_found"] = df["process_comment"].apply(extract_months)
df["fuzzy_months_found"] = df["process_comment"].apply(best_month_matches)
df["person_name_matches"] = df["process_comment"].apply(find_all_person_matches)
df["person_name_matches_possibilities"] = df["process_comment"].apply(find_all_person_matches_possibilities)

# Debugging helper, kept for reference:
# for ix, row in df[["process_comment", "years_found", "months_found", "fuzzy_months_found", "person_name_matches", "person_name_matches_possibilities"]].iterrows():
#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
#         print(f"Row number {ix} | {row['process_comment']}")
#         print(f"Person name matches: {row['person_name_matches']} | Person name matches possibilities: {row['person_name_matches_possibilities']}")

month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
month_indirect_found = df[df['months_found'].apply(lambda x: x == ["N/A"]) | df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])]
years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"])]
people_found_possibilities = df[df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"]) & df['person_name_matches'].apply(lambda x: x == ["N/A"])]

print("Rows with years found (not N/A):", len(years_found), '/', len(df))
print("Rows with months found directly (not N/A):", len(month_direct_found), '/', len(df))
print("Rows with months found via fuzzy match (not N/A):", len(month_indirect_found), '/', len(df))
print("Rows with names found:", len(people_found), '/', len(df))
print("Rows with names found only as possibilities:", len(people_found_possibilities), '/', len(df))
end_time = perf_counter()
print(f"Total processing time: {end_time - start_time:.2f} seconds")
@@ -0,0 +1,266 @@
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text

session_factory = get_session_factory()
session = session_factory()


# LIMIT and OFFSET are added directly to the SQL query
def query_to_run(limit: int, offset: int):
    return sqlalchemy_text(f"""
        SELECT a.id, a.uu_id, a.iban, a.bank_date, a.process_comment, a.payment_result_type,
               b.part_code, COUNT(a2.id) AS sum_a2_id
        FROM public.account_records AS a
        INNER JOIN public.build_parts b ON b.id = a.build_parts_id
        INNER JOIN public.api_enum_dropdown ae ON ae.id = a.payment_result_type
        INNER JOIN public.build_living_space bl ON bl.id = a.living_space_id
        INNER JOIN public.people p ON p.id = bl.person_id
        LEFT JOIN public.account_records_model_train a2 ON a2.account_records_id = a.id
        WHERE a.bank_date::date > '2023-06-30' AND a2.id IS NULL
        GROUP BY a.id, a.uu_id, a.iban, a.bank_date, a.process_comment, a.payment_result_type, b.part_code
        ORDER BY a.bank_date ASC
        LIMIT {int(limit)} OFFSET {int(offset)};
    """)


"""
Reference enum rows used by Categories below:

52 56b75aec-d28f-4cd4-84e9-ea222cc1d9bd BuildTypes     APT_KZN Apartman Kazan Dairesi
53 a9f854d1-d01d-4f2a-af5f-1ccf34193e0f BuildTypes     APT_GRJ Apartman Garaj
54 ed7371a4-0a0a-491d-b1f9-015025b6ac91 BuildTypes     APT_DP  Apartman Depo
55 f6eb95dd-5ed0-407b-8205-4bc855199b06 BuildTypes     DAIRE   Apartman Dairesi
56 242bbe5e-44df-4f10-9583-9d80ff93c52d BuildTypes     APT     Apartman Binası
57 8920eb8b-a5aa-42c3-81d0-13afca85ba1f BuildTypes     APT_YNT Apartman Yönetimi
58 a7b98daf-c83e-494d-8938-d716be131b5d BuildTypes     APT_PRK Apartman Açık Park Alanı
59 628188d9-b5e3-493e-9a42-afac3f5bf816 BuildTypes     APT_YSL Apartman Yeşil Alan
60 8b5bcca2-7702-4486-904c-d708248ccd4d BuildTypes     APT_YOL Apartman Ara Yol
 5 1b51381a-b5a9-485e-884e-fab07b4adf21 BuildDuesTypes BDT-S   Service fee
 6 4619b29f-7b60-4b95-9a97-50a4e5d40f94 BuildDuesTypes BDT-I   Information
 1 2d0127eb-899e-47c5-ad86-67a78174bf90 BuildDuesTypes BDT-D   Bina Aidat
 2 11656423-24b7-4ed9-96e7-1563f639da53 BuildDuesTypes BDT-A   Bina Ek Aidat
 3 c74c72f4-5e10-4d00-8016-4f9ddd50b3c4 BuildDuesTypes BDT-R   Bina Tadilat
 4 5edeb654-b7ce-4c1f-b7e3-2c717bb1d263 BuildDuesTypes BDT-L   Bina Yasal Harcama
48 f14ae805-8238-438f-a522-d8ac6553f717 TimePeriod     TP-W    Weekly
49 184c3356-6397-476d-a965-45ddf26a4ff5 TimePeriod     TP-M    Monthly
50 ba36110f-7afe-4c41-bcad-f80ce71f626e TimePeriod     TP-Q    Quarterly
51 89ff94c6-126e-45c2-9bc7-6d1007d02528 TimePeriod     TP-Y    Yearly
32 3cf533a4-3947-4563-9a43-16ea2bab1119 PerComType     1       Person
"""


class Categories:
    APTKZN = "APTKZN"
    APTGRJ = "APTGRJ"
    APTDP = "APTDP"
    DAIRE = "DAIRE"
    APT = "APT"
    APTYNT = "APTYNT"
    APTPRK = "APTPRK"
    APTYSL = "APTYSL"
    APTYOL = "APTYOL"
    BDTI = "BDTI"
    BDTD = "BDTD"
    BDTA = "BDTA"
    BDTR = "BDTR"
    BDTL = "BDTL"
    TPW = "TPW"
    TPM = "TPM"
    TPQ = "TPQ"
    TPY = "TPY"
    PERSON = "PERSON"

    @classmethod
    def get_category_id(cls, category_name):
        category_dict = {
            "APTKZN": (52, "56b75aec-d28f-4cd4-84e9-ea222cc1d9bd"),
            "APTGRJ": (53, "a9f854d1-d01d-4f2a-af5f-1ccf34193e0f"),
            "APTDP": (54, "ed7371a4-0a0a-491d-b1f9-015025b6ac91"),
            "DAIRE": (55, "f6eb95dd-5ed0-407b-8205-4bc855199b06"),
            "APT": (56, "242bbe5e-44df-4f10-9583-9d80ff93c52d"),
            "APTYNT": (57, "8920eb8b-a5aa-42c3-81d0-13afca85ba1f"),
            "APTPRK": (58, "a7b98daf-c83e-494d-8938-d716be131b5d"),
            "APTYSL": (59, "628188d9-b5e3-493e-9a42-afac3f5bf816"),
            "APTYOL": (60, "8b5bcca2-7702-4486-904c-d708248ccd4d"),
            "BDTI": (5, "1b51381a-b5a9-485e-884e-fab07b4adf21"),  # NB: row 5 is BDT-S in the reference list above
            "BDTD": (1, "2d0127eb-899e-47c5-ad86-67a78174bf90"),
            "BDTA": (2, "11656423-24b7-4ed9-96e7-1563f639da53"),
            "BDTR": (3, "c74c72f4-5e10-4d00-8016-4f9ddd50b3c4"),
            "BDTL": (4, "5edeb654-b7ce-4c1f-b7e3-2c717bb1d263"),
            "TPW": (48, "f14ae805-8238-438f-a522-d8ac6553f717"),
            "TPM": (49, "184c3356-6397-476d-a965-45ddf26a4ff5"),
            "TPQ": (50, "ba36110f-7afe-4c41-bcad-f80ce71f626e"),
            "TPY": (51, "89ff94c6-126e-45c2-9bc7-6d1007d02528"),
            "PERSON": (32, "3cf533a4-3947-4563-9a43-16ea2bab1119"),
        }
        if category_name not in category_dict:
            raise ValueError(f"Invalid category name: {category_name}")
        return category_dict[category_name]
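

# Example lookup (values come straight from category_dict above):
# Categories.get_category_id(Categories.DAIRE)
# -> (55, "f6eb95dd-5ed0-407b-8205-4bc855199b06")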


def get_model_train_query():
    """
    Returns a SQLAlchemy text object for inserting model train data.
    Parameters are bound when the query is executed.
    """
    return sqlalchemy_text("""
        INSERT INTO public.account_records_model_train (
            account_records_id,
            account_records_uu_id,
            search_text,
            start_index,
            end_index,
            category_id,
            category_uu_id
        )
        VALUES (
            :account_records_id,
            :account_records_uu_id,
            :search_text,
            :start_index,
            :end_index,
            :category_id,
            :category_uu_id
        );
    """)


def find_indices(search_text: str, target_text: str, offset: int = 1):
    """
    Returns the start and end indices of search_text within target_text.

    Args:
        search_text (str): The text to search for
        target_text (str): The text to search within
        offset (int, optional): Which occurrence to find (1 for first, 2 for second, etc.). Defaults to 1.

    Returns:
        tuple: (start_index, end_index) of the found text, or (-1, -1) if not
        found or the requested occurrence does not exist
    """
    if offset < 1:
        offset = 1

    current_pos = 0
    occurrence_count = 0

    while True:
        start_index = target_text.find(search_text, current_pos)

        if start_index == -1:  # Not found (also covers a too-large offset)
            return -1, -1

        occurrence_count += 1

        if occurrence_count == offset:
            # End index is exclusive, Python-slice style
            return start_index, start_index + len(search_text)

        # Move past this occurrence to find the next one
        current_pos = start_index + 1


class AccountRecord:

    def __init__(self, id, uu_id, iban, bank_date, process_comment, payment_result_type, part_code, sum_a2_id):
        self.id = id
        self.uu_id = str(uu_id)
        self.iban = iban
        self.bank_date = bank_date
        self.process_comment = process_comment
        self.payment_result_type = payment_result_type
        self.part_code = part_code
        self.sum_a2_id = sum_a2_id

    def to_dict(self):
        return {
            "id": self.id,
            "uu_id": self.uu_id,
            "iban": self.iban,
            "bank_date": self.bank_date,
            "process_comment": self.process_comment,
            "payment_result_type": self.payment_result_type,
            "part_code": self.part_code,
            "sum_a2_id": self.sum_a2_id,
        }


# Execute the query and process the results directly
results = session.execute(query_to_run(limit=1, offset=0))

account_record_dict = dict()
for result in results:
    # Example row shape:
    # (791, UUID('9d276cc8-289f-45c1-9805-44464af5d7bf'), 'TR400006400000142450093333',
    #  datetime.datetime(2023, 7, 1, 12, 22, 27, tzinfo=datetime.timezone.utc),
    #  '2 NOLU DAİRE TEMMUZ Ç3 AİDAT*SONGÜL VAR*Hİ7748686973', 1, 'DAIRE_2', 4)
    account_record = AccountRecord(*result)
    account_record_dict = account_record.to_dict()

print(account_record_dict['id'])
print(account_record_dict['uu_id'])
print(account_record_dict['process_comment'])


account_records_id = 219
account_records_uuid = "5d301273-806c-47d6-aeeb-e056dc119494"
sample_text = "GÜNEŞ APARTMANI AİDAT EYLÜL*HASAN CİHAN ŞENKÜÇÜK*Hİ9021822604"
search_text = "HASAN CİHAN ŞENKÜÇÜK"
start_index, end_index = find_indices(search_text, sample_text, offset=1)
print("start_index", start_index)
print("end_index", end_index)
category_id, category_uuid = Categories.get_category_id(Categories.PERSON)

# Prepare the parameters for the query
params = {
    "account_records_id": account_records_id,
    "account_records_uu_id": account_records_uuid,
    "search_text": search_text,
    "start_index": start_index,
    "end_index": end_index,
    "category_id": category_id,
    "category_uu_id": category_uuid,
}

# Get the parameterized query template
query_template = get_model_train_query()

# Print parameters for debugging
print("Parameters:", params)

# Execute the query with bound parameters
session.execute(query_template, params)
session.commit()
@@ -0,0 +1 @@
Subproject commit 0695581ea98094b470369add8a170469cc6102ad
@@ -0,0 +1,5 @@
sqlalchemy-mixins>=2.0.5
psycopg2-binary>=2.9.10
arrow>=1.3.0
pandas>=2.2.2
numpy>=1.26.4
unidecode
@@ -0,0 +1,9 @@
from base_import import get_session_factory, engine
from sqlalchemy import text as sqlalchemy_text
import pandas as pd

# session_factory = get_session_factory()
# session = session_factory()

query = sqlalchemy_text("SELECT * FROM public.account_records where currency_value > 0")
pd.read_sql(query, engine).to_csv("account_records_incoming.csv", index=False)
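
# For large tables, a chunked export keeps memory flat (sketch using the
# same query/engine as above; the chunk size is an arbitrary choice):
# first = True
# for chunk in pd.read_sql(query, engine, chunksize=50_000):
#     chunk.to_csv("account_records_incoming.csv", mode="w" if first else "a", header=first, index=False)
#     first = False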
@@ -0,0 +1,553 @@
import re
import arrow
from unidecode import unidecode
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text

session_factory = get_session_factory()
session = session_factory()

turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
turkish_months_abbr = {
    "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
    "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
start_year = 1950
current_year = arrow.now().year

people_query = sqlalchemy_text("""
    SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
    FROM public.people as p
    INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
    INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
    INNER JOIN public.build as b ON b.id = bp.build_id
    WHERE b.id = 1
    ORDER BY p.id
""")
people_raw = session.execute(people_query).all()

# Drop rows whose full merged name repeats (DISTINCT ON keeps one row per
# person id, but the same name can still appear under several ids)
remove_duplicate = list()
clean_people_list = list()
for person in people_raw:
    merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
    if merged_name not in remove_duplicate:
        clean_people_list.append(person)
        remove_duplicate.append(merged_name)

people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]

query_account_records = sqlalchemy_text("""
    SELECT a.id, a.iban, a.bank_date, a.process_comment
    FROM public.account_records as a
    WHERE currency_value > 0
""")  # optionally: AND bank_date::date >= '2020-01-01'
account_records = session.execute(query_account_records).all()
account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]


def clean_text(text):
    text = str(text)
    # Drop long digit runs (IBAN fragments, reference numbers, etc.)
    text = re.sub(r'\d{8,}', ' ', text)
    # Replace separator characters with spaces
    for ch in "/_-+*,.":
        text = text.replace(ch, " ")
    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def normalize_text(text):
    # Map Turkish-specific uppercase letters before unidecode/lower, because
    # str.lower() is not locale-aware for 'İ' and 'I'
    text = text.replace('İ', 'i')
    text = text.replace('I', 'ı')
    text = text.replace('Ş', 'ş')
    text = text.replace('Ğ', 'ğ')
    text = text.replace('Ü', 'ü')
    text = text.replace('Ö', 'ö')
    text = text.replace('Ç', 'ç')
    return unidecode(text).lower()
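

# Why the manual mapping matters (illustrative values):
# "IŞIK".lower() mishandles the Turkish dotless I, while
# normalize_text("IŞIK") -> "isik", matching normalize_text("ışık").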


def get_person_initials(person):
    parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("surname", ""), person.get("birthname", "")]
    return [unidecode(p.strip())[0].upper() for p in parts if p]


def get_text_initials(matched_text):
    return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]


def generate_dictionary_of_patterns():
    """
    Build one list of compiled name regexes per person id. Middle names are
    handled explicitly; a possible refinement is to drop the middle name and
    instead match firstname + any single word + surname.
    """
    patterns_dict = {}

    for person in people:
        person_id = person.get('id')
        firstname = person.get('firstname', '').strip() if person.get('firstname') else ""
        middle_name = person.get('middle_name', '').strip() if person.get('middle_name') else ""
        surname = person.get('surname', '').strip() if person.get('surname') else ""
        birthname = person.get('birthname', '').strip() if person.get('birthname') else ""

        if not firstname or not surname:
            continue

        # Original, normalized, and initial forms of each name part
        name_parts = {
            'firstname': {
                'orig': firstname,
                'norm': normalize_text(firstname),
                'init': normalize_text(firstname)[0],
            },
            'surname': {
                'orig': surname,
                'norm': normalize_text(surname),
                'init': normalize_text(surname)[0],
            },
        }

        if middle_name:
            name_parts['middle_name'] = {
                'orig': middle_name,
                'norm': normalize_text(middle_name),
                'init': normalize_text(middle_name)[0],
            }

        if birthname and normalize_text(birthname) != normalize_text(surname):
            name_parts['birthname'] = {
                'orig': birthname,
                'norm': normalize_text(birthname),
                'init': normalize_text(birthname)[0],
            }

        person_patterns = set()

        def create_pattern(parts, formats, separators=None):
            if separators is None:
                separators = [""]

            patterns = []
            for fmt in formats:
                for sep in separators:
                    pattern_parts = []
                    for part_type, part_name in fmt:
                        if part_name in parts and part_type in parts[part_name]:
                            pattern_parts.append(re.escape(parts[part_name][part_type]))
                    if pattern_parts:
                        patterns.append(r"\b" + sep.join(pattern_parts) + r"\b")
            return patterns

        # Full-name orderings (first+last both ways; with a middle name, only
        # the first-middle-last order is generated)
        name_formats = [
            [('orig', 'firstname'), ('orig', 'surname')],
            [('norm', 'firstname'), ('norm', 'surname')],
            [('orig', 'surname'), ('orig', 'firstname')],
            [('norm', 'surname'), ('norm', 'firstname')],
        ]
        if 'middle_name' in name_parts:
            name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')],
            ]

        person_patterns.update(create_pattern(name_parts, name_formats, [" ", ""]))

        if 'middle_name' in name_parts:
            middle_name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name')],
                [('norm', 'firstname'), ('norm', 'middle_name')],
                [('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'middle_name'), ('norm', 'surname')],
            ]
            person_patterns.update(create_pattern(name_parts, middle_name_formats, [" ", ""]))

        if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']:
            birthname_formats = [
                [('orig', 'firstname'), ('orig', 'birthname')],
                [('norm', 'firstname'), ('norm', 'birthname')],
                [('orig', 'birthname'), ('orig', 'firstname')],
                [('norm', 'birthname'), ('norm', 'firstname')],
            ]
            person_patterns.update(create_pattern(name_parts, birthname_formats, [" ", ""]))

        # Initial-based patterns, e.g. "HCS", "H.C.S", "H C S", "H. C. S"
        initial_formats = [
            [('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')],
            [('init', 'firstname'), ('init', 'surname')],
        ]
        person_patterns.update(create_pattern(name_parts, initial_formats, ["", ".", " ", ". "]))

        patterns_dict[person_id] = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns]

    return patterns_dict
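

# Sketch of what one entry looks like for a hypothetical person
# {"firstname": "Hasan", "middle_name": "Cihan", "surname": "Kaya", ...}:
# the pattern set mixes spaced/unspaced full names ("hasan cihan kaya",
# "hasancihankaya") and initials ("hck", "h.c.k", "h c k", "h. c. k"),
# all compiled case-insensitive and keyed by the person's id.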


def extract_person_name_with_regex(found_dict, process_comment, patterns_dict):
    cleaned_text = process_comment
    all_matches = []

    for person_id, patterns in patterns_dict.items():
        person = next((p for p in people if p.get('id') == person_id), None)
        if not person:
            continue

        firstname_norm = normalize_text(person.get("firstname", "").strip()) if person.get("firstname") else ""
        surname_norm = normalize_text(person.get("surname", "").strip()) if person.get("surname") else ""
        birthname_norm = normalize_text(person.get("birthname", "").strip()) if person.get("birthname") else ""

        text_norm = normalize_text(process_comment)
        person_matches = []

        for pattern in patterns:
            for match in pattern.finditer(text_norm):
                start, end = match.span()
                matched_text = process_comment[start:end]
                matched_text_norm = normalize_text(matched_text)

                # Strict validation: single-word matches are rejected, and a
                # multi-word match must contain the firstname AND either the
                # surname or the birthname
                is_valid_match = False
                if len(matched_text_norm.split()) > 1:
                    has_firstname = firstname_norm and firstname_norm in matched_text_norm
                    has_surname = surname_norm and surname_norm in matched_text_norm
                    has_birthname = birthname_norm and birthname_norm in matched_text_norm
                    if has_firstname and (has_surname or has_birthname):
                        is_valid_match = True

                if is_valid_match:
                    person_matches.append({
                        'matched_text': matched_text,
                        'start': start,
                        'end': end,
                    })

        if person_matches:
            # Prefer the longest matches, then keep only non-overlapping spans
            person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True)

            non_overlapping_matches = []
            for match in person_matches:
                overlaps = any(
                    match['start'] < existing['end'] and match['end'] > existing['start']
                    for existing in non_overlapping_matches
                )
                if not overlaps:
                    non_overlapping_matches.append(match)

            if non_overlapping_matches:
                found_dict["name_match"] = person
                all_matches.extend((match, person) for match in non_overlapping_matches)

    if all_matches:
        # Blank out every matched word so later extractors see a cleaner text
        all_matches.sort(key=lambda x: x[0]['start'], reverse=True)

        for match, person in all_matches:
            for word in match['matched_text'].split():
                word_norm = normalize_text(word).strip()
                if not word_norm:
                    continue

                text_norm = normalize_text(cleaned_text)
                for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE):
                    start, end = word_match.span()
                    cleaned_text = cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:]

        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return found_dict, cleaned_text


def extract_build_parts_info(found_dict, process_comment):
    """
    Regex extraction of flat references in forms such as:
        2 nolu daire, 9 NUMARALI DAI, daire 3, 3 nolu dairenin, 11nolu daire,
        Daire No 12, 2NOLU DAIRE, 12 No lu daire, D:10, NO:11, NO :3
    """
    cleaned_text = process_comment

    def clean_text_apartment_number(text, match):
        clean = text.replace(match.group(0), '').strip()
        return re.sub(r'\s+', ' ', clean).strip()

    # Tried in priority order; the first matching pattern wins
    apartment_patterns = [
        re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE),     # 2 nolu daire
        re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE),  # 3 nolu dairenin
        re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE),        # 11nolu daire / 2NOLU DAIRE
        re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE),  # 12 No lu daire
        re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE),       # Daire No 12
        re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE),   # 9 NUMARALI DAI
        re.compile(r'daire\s*(\d+)', re.IGNORECASE),            # daire 3
        re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE),            # D:10
        re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE),           # NO:11 / NO :3
    ]
    for pattern in apartment_patterns:
        match = pattern.search(cleaned_text)
        if match:
            found_dict['apartment_number'] = match.group(1)
            return found_dict, clean_text_apartment_number(cleaned_text, match)

    return found_dict, cleaned_text
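

# e.g. extract_build_parts_info({}, "ODEME D:10 AIDAT")
# -> ({'apartment_number': '10'}, 'ODEME AIDAT')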


def extract_months(found_dict, process_comment):
    """
    Extract Turkish month names and abbreviations from the process comment.
    """
    original_text = process_comment
    # Month numbers keyed by both the Turkish spelling and its ASCII form
    month_to_number_dict = {
        "ocak": 1, "şubat": 2, "subat": 2, "mart": 3, "nisan": 4,
        "mayıs": 5, "mayis": 5, "haziran": 6, "temmuz": 7,
        "ağustos": 8, "agustos": 8, "eylül": 9, "eylul": 9,
        "ekim": 10, "kasım": 11, "kasim": 11, "aralık": 12, "aralik": 12,
    }

    def normalize_turkish(text):
        """Normalize Turkish text for case-insensitive comparison."""
        text = text.lower()
        text = text.replace('i̇', 'i')  # Collapse dotted i (i + combining dot)
        text = text.replace('ı', 'i')  # Map dotless i to a regular i
        return unidecode(text)         # Strip the remaining diacritics

    if 'months' not in found_dict:
        found_dict['months'] = []

    working_text = original_text

    # Full month names first
    for month in turkish_months:
        pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
        for match in pattern.finditer(original_text):
            matched_text = match.group(0)

            month_number = month_to_number_dict.get(month.lower())
            if month_number is None:
                month_number = month_to_number_dict.get(normalize_turkish(month))

            found_dict['months'].append({'name': month, 'number': month_number})
            working_text = working_text.replace(matched_text, '', 1)

    # Then the abbreviations, searched in the already-reduced text
    for abbr, full_month in turkish_months_abbr.items():
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
        for match in pattern.finditer(working_text):
            matched_text = match.group(0)

            month_number = month_to_number_dict.get(full_month.lower())
            if month_number is None:
                month_number = month_to_number_dict.get(normalize_turkish(full_month))

            found_dict['months'].append({'name': full_month, 'number': month_number})
            working_text = working_text.replace(matched_text, '', 1)

    return found_dict, working_text
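

# e.g. extract_months({}, "TEMMUZ AYI ODEMESI")
# -> ({'months': [{'name': 'TEMMUZ', 'number': 7}]}, ' AYI ODEMESI')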


def extract_year(found_dict, process_comment):
    """
    Extract plausible years (start_year..current_year) from the process comment.
    """
    original_text = process_comment

    if 'years' not in found_dict:
        found_dict['years'] = []

    working_text = original_text

    for year in range(start_year, current_year + 1):
        pattern = re.compile(r'\b' + str(year) + r'\b')
        for match in pattern.finditer(original_text):
            matched_text = match.group(0)
            found_dict['years'].append(int(matched_text))
            working_text = working_text.replace(matched_text, '', 1)

    return found_dict, working_text


def extract_payment_type(found_dict, process_comment):
    """
    Extract the payment type from the process comment. Recognized spellings
    include: aidat, AİD, aidatı, TADİLAT, YAKIT, yakıt, yakit.
    """
    original_text = process_comment
    working_text = original_text

    if 'payment_types' not in found_dict:
        found_dict['payment_types'] = []

    payment_keywords = {
        'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'],
        'tadilat': ['tadilat', 'tadİlat', 'tadilatı'],
        'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti'],
    }

    for payment_type, keywords in payment_keywords.items():
        for keyword in keywords:
            pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
            for match in pattern.finditer(original_text):
                matched_text = match.group(0)
                if payment_type not in found_dict['payment_types']:
                    found_dict['payment_types'].append(payment_type)
                working_text = working_text.replace(matched_text, '', 1)

    return found_dict, working_text
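

# e.g. extract_payment_type({}, "EYLUL AIDAT ODEMESI")
# -> ({'payment_types': ['aidat']}, 'EYLUL  ODEMESI')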


def main():
    patterns_dict = generate_dictionary_of_patterns()
    dicts_found = dict()
    dicts_not_found = dict()
    for account_record in account_records:
        account_record_id = str(account_record["id"])
        found_dict = {}
        process_comment_iteration = clean_text(text=account_record["process_comment"])
        found_dict, cleaned_process_comment = extract_person_name_with_regex(
            found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=patterns_dict
        )

        found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, process_comment=cleaned_process_comment)
        found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment)
        found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment)
        found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment)
        # The extractors pre-create empty list keys, so test the values
        # rather than the dict itself
        if any(found_dict.values()):
            dicts_found[process_comment_iteration] = found_dict
        else:
            dicts_not_found[process_comment_iteration] = account_record_id

    print("\n===== SUMMARY =====")
    print(f"Extracted data total     : {len(dicts_found)}")
    print(f"Not extracted data total : {len(account_records) - len(dicts_found)}")
    print(f"Account records processed: {len(account_records)}")


if __name__ == "__main__":
    main()