import re
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter

import pandas as pd
from sqlalchemy import text as sqlalchemy_text
from unidecode import unidecode

from base_import import get_session_factory

session_factory = get_session_factory()
session = session_factory()

turkish_months = [
    "OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN",
    "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK",
]

turkish_months_abbr = {
    "OCA": "OCAK",
    "SUB": "ŞUBAT",
    "ŞUB": "ŞUBAT",
    "MAR": "MART",
    "NIS": "NİSAN",
    "MAY": "MAYIS",
    "HAZ": "HAZİRAN",
    "HZR": "HAZİRAN",
    "TEM": "TEMMUZ",
    "AGU": "AĞUSTOS",
    "AGT": "AĞUSTOS",
    "EYL": "EYLÜL",
    "EKI": "EKİM",
    "KAS": "KASIM",
    "ARA": "ARALIK",
}

start_year = 1950
current_year = pd.Timestamp.now().year

# Residents of build id 1; these are the candidates for name matching.
people = session.execute(
    sqlalchemy_text("""
        SELECT p.firstname, p.middle_name, p.surname, bl.id
        FROM public.people AS p
        INNER JOIN public.build_living_space AS bl ON bl.person_id = p.id
        INNER JOIN public.build_parts AS bp ON bp.id = bl.build_parts_id
        INNER JOIN public.build AS b ON b.id = bp.build_id
        WHERE b.id = 1
    """)
).all()

people = [
    {"firstname": p[0], "middle_name": p[1], "last_name": p[2], "id": p[3]}
    for p in people
]


def clean_text(text):
    """Collapse whitespace and replace separator characters with spaces."""
    # Convert to string just in case
    text = str(text)
    # Remove extra spaces and tabs by splitting and joining
    text = " ".join(text.split())
    # Replace hyphens, asterisks and commas with spaces
    text = text.replace("-", " ").replace("*", " ").replace(",", " ")
    # Optionally lower-case or normalize unicode here if needed
    return text


def is_valid_char_match(word, month):
    """Ensure all letters in the word exist in the target month name."""
    return set(word).issubset(set(month))


def best_month_matches(text, threshold=0.7):
    """Return (month, matched word, score) tuples for likely month mentions."""
    matches = []
    words = clean_text(text).split()
    for word in words:
        # Normalize to ASCII upper case so casing and Turkish characters do not
        # break the comparison against the month names and abbreviations.
        word_norm = unidecode(word).upper()
        # First check the abbreviations dictionary for an exact match
        if word_norm in turkish_months_abbr:
            full_month = turkish_months_abbr[word_norm]
            matches.append((full_month.capitalize(), word, 1.0))
            continue
        # Otherwise fuzzy-match against the full month names
        for month in turkish_months:
            month_clean = unidecode(month.upper())
            ratio = SequenceMatcher(None, word_norm, month_clean).ratio()
            if ratio >= threshold and is_valid_char_match(word_norm, month_clean):
                matches.append((month.capitalize(), word, round(ratio, 2)))
                break
    return matches if matches else ["N/A"]


def print_out_results(data_frame, df):
    for ix, process_comment in enumerate(data_frame):
        print(f"Row number {ix} : {process_comment}")
    print(len(data_frame), '/', len(df))


def extract_years(text):
    """Find four-digit years between start_year and the current year."""
    matches = re.findall(r'\b(19\d{2}|20\d{2})\b', text)
    valid_years = [year for year in matches if start_year <= int(year) <= current_year]
    return valid_years if valid_years else ["N/A"]


def extract_months(text):
    """Find month names that appear verbatim (ASCII-normalized) in the text."""
    # Normalize text and months by converting to ASCII lowercase
    text_norm = unidecode(str(text)).lower()
    months_norm = [unidecode(m).lower() for m in turkish_months]
    found = [turkish_months[i] for i, m in enumerate(months_norm) if m in text_norm]
    return found if found else ["N/A"]


def normalize_text(text):
    return unidecode(text).lower()


def build_name_regex_all_combinations(person):
    """Compile regex patterns for every ordering of the person's name parts."""
    # "or ''" guards against NULL name columns coming back from the query
    firstname = (person.get("firstname") or "").strip()
    middle_name = (person.get("middle_name") or "").strip()
    last_name = (person.get("last_name") or "").strip()
    parts = [unidecode(p).lower() for p in [firstname, middle_name, last_name] if p]

    patterns = []
    # Use permutations instead of combinations to get all orderings
    for r in range(1, len(parts) + 1):
        for permuted_parts in permutations(parts, r):
            regex_pattern = r"\b" + r"\s*".join(map(re.escape, permuted_parts)) + r"\b"
            patterns.append((re.compile(regex_pattern, flags=re.IGNORECASE), r))

    # Add a fully merged version like "fatihergunguclu"
    if len(parts) >= 2:
        merged = ''.join(parts)
        patterns.append((re.compile(rf"\b{re.escape(merged)}\b", flags=re.IGNORECASE), len(parts)))

    return patterns


def get_person_initials(person):
    """Return the upper-case ASCII initial of each non-empty name part."""
    parts = [person.get("firstname") or "", person.get("middle_name") or "", person.get("last_name") or ""]
    return [unidecode(p.strip())[0].upper() for p in parts if p.strip()]


def get_text_initials(matched_text):
    """Return the upper-case ASCII initial of each word in the matched text."""
    return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]


def find_person_regex_all_combinations(text, person):
    """Collect every regex hit for this person in the text, with its weight."""
    text_norm = normalize_text(text)
    scored_matches = []
    for pattern, weight in build_name_regex_all_combinations(person):
        for match in pattern.finditer(text_norm):
            matched_text = match.group().strip()
            scored_matches.append({
                "matched_person": person,
                "matched_text": matched_text,
                "weight": weight,
            })
    return scored_matches


def find_all_person_matches(text):
    """Return people whose matched text shares at least two initials with them."""
    all_valid_matches = {}
    for person in people:
        matches = find_person_regex_all_combinations(text, person)
        for match in matches:
            matched_person = match["matched_person"]
            matched_text = match["matched_text"]
            weight = match["weight"]

            person_initials = get_person_initials(matched_person)
            found_text_letters = get_text_initials(matched_text)
            match_count = sum(1 for c in found_text_letters if c in person_initials)

            if match_count >= 2:
                person_key = tuple(matched_person.values())
                if person_key not in all_valid_matches or all_valid_matches[person_key][0] < weight:
                    all_valid_matches[person_key] = (weight, matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]


def find_all_person_matches_possibilities(text):
    """Return every person with any regex hit, without the initials check."""
    all_valid_matches = {}
    for person in people:
        matches = find_person_regex_all_combinations(text, person)
        for match in matches:
            matched_person = match["matched_person"]
            weight = match["weight"]

            person_key = tuple(matched_person.values())
            all_valid_matches[person_key] = (weight, matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]


start_time = perf_counter()

df = pd.read_csv("account_records_incoming.csv")
df["process_comment"] = df["process_comment"].apply(clean_text)

# Simple single-year column; expand=False keeps the extraction a Series
df["year"] = df["process_comment"].str.extract(r'(\d{4})', expand=False)
df["year"] = pd.to_numeric(df["year"], errors='coerce')
df["year"] = df["year"].where(df["year"].between(start_year, current_year))

df["years_found"] = df["process_comment"].apply(extract_years)
df["months_found"] = df["process_comment"].apply(extract_months)
df["fuzzy_months_found"] = df["process_comment"].apply(best_month_matches)
df["person_name_matches"] = df["process_comment"].apply(find_all_person_matches)
df["person_name_matches_possibilities"] = df["process_comment"].apply(find_all_person_matches_possibilities)

# df["year_str"] = df["year"].apply(lambda x: str(int(x)) if pd.notnull(x) else "N/A")
# print(df[["process_comment", "year_str", "months_found"]].head(100))

# for ix, row in df[["process_comment", "years_found", "months_found", "fuzzy_months_found", "person_name_matches", "person_name_matches_possibilities"]].iterrows():
#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
#         print(f"Row number {ix} | {row['process_comment']}")
#         # print(f"Years found: {row['years_found']} | Months found: {row['months_found']} | Fuzzy months found: {row['fuzzy_months_found']}")
#         print(f"Person name matches: {row['person_name_matches']} | Person name matches possibilities: {row['person_name_matches_possibilities']}")

month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
# Rows where the month was only recovered via fuzzy matching
month_indirect_found = df[
    df['months_found'].apply(lambda x: x == ["N/A"])
    & df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])
]
years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"])]
# Rows where a name was only found by the looser "possibilities" matcher
people_found_possibilities = df[
    df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"])
    & df['person_name_matches'].apply(lambda x: x == ["N/A"])
]

print("rows where years were found              :", len(years_found), '/', len(df))
print("rows where months were found directly    :", len(month_direct_found), '/', len(df))
print("rows where months were found only fuzzily:", len(month_indirect_found), '/', len(df))
print("rows where person names were found       :", len(people_found), '/', len(df))
print("rows where only name possibilities exist :", len(people_found_possibilities), '/', len(df))

end_time = perf_counter()
print(f"Time taken to process the CSV: {end_time - start_time:.2f} seconds")

# for ix, row in df[['id', 'process_comment', 'years_found', 'months_found', 'fuzzy_months_found', 'person_name_matches', 'person_name_matches_possibilities']].iterrows():
#     print(f"id {row['id']} | {row['process_comment']} | {row['years_found']} | {row['months_found']} | Person Name Matches : {row['person_name_matches']}")
#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
#         print(f"id {row['id']} | Fuzzy person Matches : {row['person_name_matches_possibilities']}")
#     if row['months_found'] == ["N/A"] and row['fuzzy_months_found'] != ["N/A"]:
#         print(f"id {row['id']} | Fuzzy month Matches : {row['fuzzy_months_found']}")
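
# A minimal sanity-check sketch with made-up inputs (the comment string and the
# person dict below are hypothetical, not taken from account_records_incoming.csv
# or from the database); it only illustrates what the helpers above return.
_demo_comment = clean_text("ODEME SUB 2021 KIRA*")
print("demo years :", extract_years(_demo_comment))        # -> ['2021']
print("demo months:", best_month_matches(_demo_comment))   # -> [('Şubat', 'SUB', 1.0)]

_demo_person = {"firstname": "Ali", "middle_name": None, "last_name": "Veli"}
for _pattern, _weight in build_name_regex_all_combinations(_demo_person):
    print("demo name pattern:", _weight, _pattern.pattern)
# -> \bali\b and \bveli\b with weight 1; \bali\s*veli\b, \bveli\s*ali\b and
#    \baliveli\b with weight 2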