# python-account-miner/dataCleaning.py
# (225 lines, 9.9 KiB, Python)

import re
import pandas as pd
from unidecode import unidecode
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text
# Shared SQLAlchemy session used once below to load the people list.
session_factory = get_session_factory()
session = session_factory()
# Canonical Turkish month names (uppercase, with diacritics), January..December.
turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
# Common abbreviations -> full month name; both ASCII and Turkish spellings of
# the same abbreviation are listed (e.g. SUB / ŞUB).
turkish_months_abbr = {
"OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
"TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
# Accepted range for extracted years (inclusive).
start_year = 1950
current_year = pd.Timestamp.now().year
# People living in build id 1; consumed by the name-matching helpers below.
people = session.execute(
sqlalchemy_text("""
SELECT p.firstname, p.middle_name, p.surname, bl.id FROM public.people as p
inner join public.build_living_space as bl on bl.person_id = p.id
inner join public.build_parts as bp on bp.id = bl.build_parts_id
inner join public.build as b on b.id = bp.build_id
where b.id = 1
""")
).all()
# NOTE: 'id' here is bl.id (the living-space row id), not the person id.
people = [{"firstname": p[0], "middle_name": p[1], "last_name": p[2], 'id': p[3]} for p in people]
def clean_text(text):
    """Normalize a free-text comment for downstream tokenizing.

    Hyphens, asterisks and commas are treated as word separators and
    replaced with spaces; all runs of whitespace (including those created
    by the replacement) are collapsed to single spaces.

    Parameters: text -- any value; coerced to str (tolerates NaN/numbers).
    Returns: the cleaned single-spaced string.
    """
    text = str(text)
    # Replace separator punctuation first, THEN collapse whitespace, so
    # inputs like "a - b" become "a b" rather than "a   b".
    for separator in ("-", "*", ","):
        text = text.replace(separator, " ")
    return " ".join(text.split())
def is_valid_char_match(word, month):
    """Return True when every distinct letter of *word* also occurs in *month*."""
    return not (set(word) - set(month))
def best_month_matches(text, threshold=0.7):
    """Fuzzy-detect Turkish month names among the words of *text*.

    Each word is first checked for an exact hit in the abbreviation table
    (score 1.0). Otherwise it is fuzzy-compared against every full month
    name (ASCII-normalized); the first month whose similarity reaches
    *threshold* and whose letters cover the word wins.

    Returns a list of (Month, word, score) tuples, or ["N/A"] when no
    word matched anything.
    """
    found = []
    for token in clean_text(text).split():
        expansion = turkish_months_abbr.get(token)
        if expansion is not None:
            found.append((expansion.capitalize(), token, 1.0))
            continue
        for month_name in turkish_months:
            ascii_month = unidecode(month_name.upper())
            score = SequenceMatcher(None, token, ascii_month).ratio()
            if score >= threshold and is_valid_char_match(token, ascii_month):
                found.append((month_name.capitalize(), token, round(score, 2)))
                break
    return found or ["N/A"]
def print_out_results(data_frame, df):
    """Print each entry of *data_frame* with its index, then a found/total tally."""
    for row_index, comment in enumerate(data_frame):
        print(f'Row number {row_index} : {comment}')
    print(len(data_frame), '/', len(df))
def extract_years(text, min_year=1950, max_year=None):
    """Extract plausible 4-digit years (19xx / 20xx) from *text*.

    Parameters
    ----------
    text : any -- coerced to str before matching.
    min_year : int -- lowest accepted year (default matches module start_year).
    max_year : int | None -- highest accepted year; None means the current year.

    Returns the matched year strings in order of appearance, or ["N/A"]
    when none fall inside the accepted range.
    """
    if max_year is None:
        max_year = pd.Timestamp.now().year
    candidates = re.findall(r'\b(19\d{2}|20\d{2})\b', str(text))
    valid_years = [year for year in candidates if min_year <= int(year) <= max_year]
    return valid_years if valid_years else ["N/A"]
def extract_months(text):
    """Diacritic-insensitive substring search for Turkish month names.

    Both *text* and the month names are transliterated to lowercase ASCII
    before the containment check. Returns the months in canonical
    (uppercase Turkish) spelling, or ["N/A"] when none occur.
    """
    haystack = unidecode(str(text)).lower()
    hits = [
        month
        for month in turkish_months
        if unidecode(month).lower() in haystack
    ]
    return hits if hits else ["N/A"]
def normalize_text(text):
    """Lower-case *text* after transliterating it to plain ASCII."""
    ascii_form = unidecode(text)
    return ascii_form.lower()
def build_name_regex_all_combinations(person):
    """Compile regexes covering every ordering of a person's name parts.

    The first/middle/last name parts are ASCII-normalized and lowercased;
    for every subset size r, each permutation becomes a pattern tolerating
    optional whitespace between parts. A fully merged form (e.g.
    "fatihergunguclu") is appended as well.

    Returns a list of (compiled_pattern, weight) pairs, where weight is
    the number of name parts the pattern covers.
    """
    raw_parts = (
        person.get("firstname", "").strip(),
        person.get("middle_name", "").strip(),
        person.get("last_name", "").strip(),
    )
    parts = [unidecode(part).lower() for part in raw_parts if part]
    compiled = []
    for size in range(1, len(parts) + 1):
        # All orderings, not just sorted combinations, since names may
        # appear surname-first in the source text.
        for ordering in permutations(parts, size):
            body = r"\s*".join(re.escape(piece) for piece in ordering)
            compiled.append((re.compile(r"\b" + body + r"\b", flags=re.IGNORECASE), size))
    if len(parts) >= 2:
        # Names sometimes appear with no separators at all.
        glued = ''.join(parts)
        compiled.append((re.compile(rf"\b{re.escape(glued)}\b", flags=re.IGNORECASE), len(parts)))
    return compiled
def get_person_initials(person):
    """Return the uppercase ASCII initial of each present name part.

    Skips parts that are falsy, whitespace-only, or that transliterate to
    an empty ASCII string — the original list comprehension raised
    IndexError at ``[0]`` for a whitespace-only part.
    """
    parts = (person.get("firstname", ""), person.get("middle_name", ""), person.get("last_name", ""))
    initials = []
    for part in parts:
        if not part:
            continue
        ascii_part = unidecode(part.strip())
        if ascii_part:
            initials.append(ascii_part[0].upper())
    return initials
def get_text_initials(matched_text):
    """Return the uppercase ASCII initial of each word in *matched_text*.

    Words that transliterate to an empty ASCII string are skipped instead
    of raising IndexError at ``[0]``.
    """
    initials = []
    for word in matched_text.split():
        ascii_word = unidecode(word.strip())
        if ascii_word:
            initials.append(ascii_word[0].upper())
    return initials
def find_person_regex_all_combinations(text, person):
    """Run every name-permutation pattern for *person* over *text*.

    Returns one dict per regex hit, carrying the person record, the exact
    matched substring, and the pattern's weight (name parts covered).
    """
    normalized = normalize_text(text)
    hits = []
    for compiled, weight in build_name_regex_all_combinations(person):
        hits.extend(
            {
                "matched_person": person,
                "matched_text": found.group().strip(),
                "weight": weight,
            }
            for found in compiled.finditer(normalized)
        )
    return hits
def find_all_person_matches(text):
    """Match known people against *text*, keeping only plausible hits.

    A hit is accepted only when at least two initials of the matched
    substring coincide with the person's initials; per person the
    highest-weight hit is retained. Returns the matched person dicts
    sorted by descending weight, or ["N/A"] when nobody matched.
    """
    best_by_person = {}
    for person in people:
        for hit in find_person_regex_all_combinations(text, person):
            candidate = hit["matched_person"]
            person_initials = get_person_initials(candidate)
            text_initials = get_text_initials(hit["matched_text"])
            overlap = sum(1 for letter in text_initials if letter in person_initials)
            if overlap < 2:
                continue
            key = tuple(candidate.values())
            weight = hit["weight"]
            previous = best_by_person.get(key)
            if previous is None or previous[0] < weight:
                best_by_person[key] = (weight, candidate)
    ranked = sorted(best_by_person.values(), key=lambda entry: -entry[0])
    return [entry[1] for entry in ranked] or ["N/A"]
def find_all_person_matches_possibilities(text):
    """Like find_all_person_matches but without the initials sanity check.

    Every regex hit counts; per person the highest-weight hit wins (the
    original overwrote unconditionally, which only happened to be
    equivalent because patterns are generated in ascending weight order).
    Unused initials computations removed as dead code.

    Returns person dicts sorted by descending weight, or ["N/A"] when no
    pattern matched at all.
    """
    best_by_person = {}
    for person in people:
        for hit in find_person_regex_all_combinations(text, person):
            candidate = hit["matched_person"]
            weight = hit["weight"]
            key = tuple(candidate.values())
            # Keep the strongest (most name parts) hit per person.
            if key not in best_by_person or best_by_person[key][0] < weight:
                best_by_person[key] = (weight, candidate)
    ranked = sorted(best_by_person.values(), key=lambda entry: -entry[0])
    return [entry[1] for entry in ranked] or ["N/A"]
# ---------------------------------------------------------------------------
# Pipeline: load incoming account records, clean the free-text comment, then
# derive year / month / person-name columns and report hit rates.
# ---------------------------------------------------------------------------
start_time = perf_counter()
df = pd.read_csv("account_records_incoming.csv")
df["process_comment"] = df["process_comment"].apply(clean_text)

# First 4-digit number per comment, kept only when within [start_year, current_year].
df["year"] = df["process_comment"].str.extract(r'(\d{4})')
df["year"] = pd.to_numeric(df["year"], errors='coerce')
df["year"] = df["year"].where(df["year"].between(start_year, current_year))

# Derived feature columns; each cell is a list, or the ["N/A"] sentinel.
df["years_found"] = df["process_comment"].apply(extract_years)
df["months_found"] = df["process_comment"].apply(extract_months)
df["fuzzy_months_found"] = df["process_comment"].apply(best_month_matches)
df["person_name_matches"] = df["process_comment"].apply(find_all_person_matches)
df["person_name_matches_possibilities"] = df["process_comment"].apply(find_all_person_matches_possibilities)

# Hit-rate breakdown.
month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
# Rows where the direct month scan failed but the fuzzy scan succeeded.
# (Was `|`, which also counted rows with no fuzzy hit at all; `&` mirrors
# the people_found_possibilities filter below.)
month_indirect_found = df[df['months_found'].apply(lambda x: x == ["N/A"]) & df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])]
years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"])]
people_found_possibilities = df[df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"]) & df['person_name_matches'].apply(lambda x: x == ["N/A"])]

print("length of years where is not N/A : ", len(years_found), '/', len(df))
print("length of months where is not N/A : ", len(month_direct_found), '/', len(df))
print("length of fuzzy months where is not N/A : ", len(month_indirect_found), '/', len(df))
print("length of names found", len(people_found), '/', len(df))
print("length of names found with possibilities", len(people_found_possibilities), '/', len(df))
end_time = perf_counter()
# The timer spans the whole pipeline, not just the CSV read.
print(f"Time taken to process records: {end_time - start_time:.2f} seconds")