# Extracts years, Turkish month names, and resident-name matches from account-record comments.
import re
|
|
import pandas as pd
|
|
from unidecode import unidecode
|
|
from difflib import SequenceMatcher
|
|
from itertools import permutations
|
|
from time import perf_counter
|
|
from base_import import get_session_factory
|
|
from sqlalchemy import text as sqlalchemy_text
|
|
|
|
# Single module-level DB session used once below to load the residents list.
session_factory = get_session_factory()
session = session_factory()

# Canonical Turkish month names, upper-cased, with Turkish-specific letters.
turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
# Common 3-letter month abbreviations (ASCII and Turkish spellings) -> full month name.
turkish_months_abbr = {
    "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
    "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
# Years extracted from comments are accepted only within [start_year, current_year].
start_year = 1950
current_year = pd.Timestamp.now().year

# Residents of build id 1: name parts plus their build_living_space id.
people = session.execute(
    sqlalchemy_text("""
    SELECT p.firstname, p.middle_name, p.surname, bl.id FROM public.people as p
    inner join public.build_living_space as bl on bl.person_id = p.id
    inner join public.build_parts as bp on bp.id = bl.build_parts_id
    inner join public.build as b on b.id = bp.build_id
    where b.id = 1
    """)
).all()
# DB "surname" is exposed as "last_name"; NOTE(review): middle_name may be None
# (NULL column) — downstream name matching should tolerate that.
people = [{"firstname": p[0], "middle_name": p[1], "last_name": p[2], 'id': p[3]} for p in people]
|
|
|
|
|
|
def clean_text(text):
    """Normalize a free-text comment for matching.

    Coerces *text* to ``str``, collapses runs of whitespace to single spaces,
    then blanks out ``-``, ``*`` and ``,`` separators (each becomes a space).
    Note: separator replacement happens after whitespace collapsing, so e.g.
    "x, y" yields "x  y" (two spaces).
    """
    collapsed = " ".join(str(text).split())
    for separator in ("-", "*", ","):
        collapsed = collapsed.replace(separator, " ")
    return collapsed
|
|
|
|
|
|
def is_valid_char_match(word, month):
    """Return True when every letter of *word* also occurs somewhere in *month*.

    Used as a sanity filter on fuzzy month matches so that a high similarity
    ratio cannot be produced by letters the month name does not contain.
    """
    return all(letter in month for letter in word)
|
|
|
|
|
|
def best_month_matches(text, threshold=0.7):
    """Find Turkish month mentions in *text* via abbreviation lookup, then fuzzy match.

    Each whitespace token of the cleaned text is first looked up in
    ``turkish_months_abbr`` (exact hit, score 1.0).  Otherwise it is fuzzy-compared
    (``SequenceMatcher``) against every ASCII-folded month name; the first month
    reaching *threshold* whose letters all cover the token is taken.

    Returns a list of ``(Month, token, score)`` tuples, or ``["N/A"]`` when no
    token matches anything.
    """
    results = []
    for token in clean_text(text).split():
        # Exact abbreviation hit takes priority over fuzzy matching.
        expanded = turkish_months_abbr.get(token)
        if expanded is not None:
            results.append((expanded.capitalize(), token, 1.0))
            continue

        for month_name in turkish_months:
            ascii_month = unidecode(month_name.upper())
            score = SequenceMatcher(None, token, ascii_month).ratio()
            if score >= threshold and is_valid_char_match(token, ascii_month):
                results.append((month_name.capitalize(), token, round(score, 2)))
                break

    return results or ["N/A"]
|
|
|
|
|
|
def print_out_results(data_frame, df):
    """Print each entry of *data_frame* with its row index, then a found/total summary.

    *df* is only used for the denominator of the summary line.
    """
    row_number = 0
    for process_comment in data_frame:
        print(f'Row number {row_number} : {process_comment}')
        row_number += 1
    print(len(data_frame), '/', len(df))
|
|
|
|
|
|
def extract_years(text):
    """Extract every standalone 4-digit year (19xx/20xx) within the accepted window.

    Years are kept only when ``start_year <= year <= current_year`` (module
    globals).  Returns the matching year strings, or ``["N/A"]`` when none pass.
    """
    candidates = re.findall(r'\b(19\d{2}|20\d{2})\b', text)
    in_range = [candidate for candidate in candidates
                if start_year <= int(candidate) <= current_year]
    return in_range or ["N/A"]
|
|
|
|
|
|
def extract_months(text):
    """Return Turkish month names whose ASCII-folded form appears as a substring of *text*.

    Both the text and the month names are ASCII-folded and lower-cased before the
    substring test, so accents/Turkish letters do not block a match.  Returns the
    original (accented, upper-case) month names, or ``["N/A"]`` when none occur.
    """
    haystack = unidecode(str(text)).lower()
    hits = [month for month in turkish_months
            if unidecode(month).lower() in haystack]
    return hits or ["N/A"]
|
|
|
|
|
|
def normalize_text(text):
    """Lower-cased, ASCII-folded version of *text* for accent-insensitive matching."""
    folded = unidecode(text)
    return folded.lower()
|
|
|
|
|
|
def build_name_regex_all_combinations(person):
    """Compile regex patterns for every ordering and subset of a person's name parts.

    For each non-empty part of firstname/middle_name/last_name (ASCII-folded,
    lower-cased), emits one pattern per permutation of every subset size, parts
    joined by optional whitespace (``\\s*``), plus one fully merged pattern
    (e.g. "fatihergunguclu") when there are at least two parts.

    Returns a list of ``(compiled_pattern, weight)`` tuples where *weight* is the
    number of name parts the pattern covers (higher = stronger match).
    """
    # DB rows may carry NULL name fields; dict.get's default does NOT apply to an
    # explicit None value, so coalesce with `or ""` before stripping — the previous
    # code raised AttributeError on None.strip().
    firstname = (person.get("firstname") or "").strip()
    middle_name = (person.get("middle_name") or "").strip()
    last_name = (person.get("last_name") or "").strip()

    parts = [unidecode(p).lower() for p in [firstname, middle_name, last_name] if p]

    patterns = []
    for r in range(1, len(parts) + 1):
        # Permutations (not combinations): the names may appear in any order in text.
        for permuted_parts in permutations(parts, r):
            regex_pattern = r"\b" + r"\s*".join(map(re.escape, permuted_parts)) + r"\b"
            patterns.append((re.compile(regex_pattern, flags=re.IGNORECASE), r))

    # Also match all parts run together with no separators, e.g. "fatihergunguclu".
    if len(parts) >= 2:
        merged = ''.join(parts)
        patterns.append((re.compile(rf"\b{re.escape(merged)}\b", flags=re.IGNORECASE), len(parts)))

    return patterns
|
|
|
|
|
|
def get_person_initials(person):
    """Return the upper-cased, ASCII-folded first letter of each name part.

    Skips missing (None), empty, and whitespace-only parts, and parts whose
    ASCII transliteration is empty — the previous comprehension raised
    IndexError on a whitespace-only part ("  " is truthy but strips to "").
    """
    parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("last_name", "")]
    initials = []
    for part in parts:
        stripped = (part or "").strip()
        folded = unidecode(stripped) if stripped else ""
        if folded:
            initials.append(folded[0].upper())
    return initials
|
|
|
|
def get_text_initials(matched_text):
    """Return the upper-cased, ASCII-folded first letter of each word in *matched_text*."""
    letters = []
    for word in matched_text.split():
        trimmed = word.strip()
        if trimmed:
            letters.append(unidecode(trimmed)[0].upper())
    return letters
|
|
|
|
def find_person_regex_all_combinations(text, person):
    """Run every name-permutation pattern for *person* over the normalized *text*.

    Returns one dict per regex hit with keys ``matched_person`` (the input dict),
    ``matched_text`` (the stripped matched span) and ``weight`` (number of name
    parts the matching pattern covers).
    """
    normalized = normalize_text(text)
    hits = []

    for pattern, weight in build_name_regex_all_combinations(person):
        hits.extend(
            {
                "matched_person": person,
                "matched_text": found.group().strip(),
                "weight": weight,
            }
            for found in pattern.finditer(normalized)
        )

    return hits
|
|
|
|
def find_all_person_matches(text):
    """Match every known person (module-level ``people``) against *text*.

    A regex hit counts only when at least two initials of the matched words agree
    with the person's initials; per person only the highest-weight hit is kept.
    Returns the matched person dicts sorted by descending weight, or ``["N/A"]``
    when nobody qualifies.
    """
    best_by_person = {}

    for person in people:
        for hit in find_person_regex_all_combinations(text, person):
            candidate = hit["matched_person"]
            candidate_initials = get_person_initials(candidate)
            hit_initials = get_text_initials(hit["matched_text"])

            overlap = sum(1 for letter in hit_initials if letter in candidate_initials)
            if overlap < 2:
                # Too weak: fewer than two initials line up with the person.
                continue

            key = tuple(candidate.values())
            weight = hit["weight"]
            stored = best_by_person.get(key)
            if stored is None or stored[0] < weight:
                best_by_person[key] = (weight, candidate)

    ranked = sorted(best_by_person.values(), key=lambda entry: -entry[0])
    return [matched for _, matched in ranked] or ["N/A"]
|
|
|
|
|
|
def find_all_person_matches_possibilities(text):
    """Loose variant of ``find_all_person_matches``: no initials filter.

    Every regex hit for every person is kept, so this surfaces weaker "possible"
    matches that the strict matcher rejects.  Returns the person dicts sorted by
    descending weight, or ``["N/A"]`` when nothing hits.

    (Cleanup: the previous version computed person/text initials per hit and
    never used them — dead code with per-hit transliteration cost — removed.)
    """
    all_valid_matches = {}

    for person in people:
        for match in find_person_regex_all_combinations(text, person):
            matched_person = match["matched_person"]
            person_key = tuple(matched_person.values())
            # NOTE(review): unlike find_all_person_matches this overwrites
            # unconditionally, so the LAST hit's weight wins rather than the
            # highest — confirm whether keeping the max weight was intended.
            all_valid_matches[person_key] = (match["weight"], matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]
|
|
|
|
# ---- Main script: enrich the incoming CSV with extracted dates and person matches ----
start_time = perf_counter()
df = pd.read_csv("account_records_incoming.csv")

# Normalize free-text comments (collapse whitespace, blank out -, *, , separators).
df["process_comment"] = df["process_comment"].apply(clean_text)
# Quick single-year column: first 4-digit run in each comment, numeric or NaN.
df["year"] = df["process_comment"].str.extract(r'(\d{4})')
df["year"] = pd.to_numeric(df["year"], errors='coerce')

# Blank out years falling outside the plausible [start_year, current_year] window.
df["year"] = df["year"].where(df["year"].between(start_year, current_year))
# Richer list-valued extraction columns (each holds a list or ["N/A"]).
df["years_found"] = df["process_comment"].apply(extract_years)
df["months_found"] = df["process_comment"].apply(extract_months)
df["fuzzy_months_found"] = df["process_comment"].apply(best_month_matches)
df["person_name_matches"] = df["process_comment"].apply(find_all_person_matches)
df["person_name_matches_possibilities"] = df["process_comment"].apply(find_all_person_matches_possibilities)

# (Commented-out debug output kept for reference.)
# df["year_str"] = df["year"].apply(lambda x: str(int(x)) if pd.notnull(x) else "N/A")
# print(df[["process_comment", "year_str", "months_found"]].head(100))

# for ix, row in df[["process_comment", "years_found", "months_found", "fuzzy_months_found", "person_name_matches", "person_name_matches_possibilities"]].iterrows():
#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
#         print(f"Row number {ix} | {row['process_comment']}")
#         # print(f"Years found: {row['years_found']} | Months found: {row['months_found']} | Fuzzy months found: {row['fuzzy_months_found']}")
#         print(f"Person name matches: {row['person_name_matches']} | Person name matches possibilities: {row['person_name_matches_possibilities']}")

# Row subsets for the summary report printed below.
month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
# NOTE(review): this selects (months == N/A) OR (fuzzy != N/A), but the label
# printed below says "fuzzy months ... not N/A" — confirm whether `&` (direct
# miss AND fuzzy hit) was intended instead of `|`.
month_indirect_found = df[df['months_found'].apply(lambda x: x == ["N/A"]) | df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])]
years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"]) ]
# Rows only the loose matcher found (strict matcher returned N/A).
people_found_possibilities = df[df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"]) & df['person_name_matches'].apply(lambda x: x == ["N/A"])]

print("length of years where is not N/A : ", len(years_found), '/', len(df))
print("length of months where is not N/A : ", len(month_direct_found), '/', len(df))
print("length of fuzzy months where is not N/A : ", len(month_indirect_found), '/', len(df))
print("lenght of names found", len(people_found), '/', len(df))
print("lenght of names found with possibilities", len(people_found_possibilities), '/', len(df))
end_time = perf_counter()
# NOTE(review): the timer wraps the whole pipeline (CSV read + all .apply passes),
# not just the CSV read as the message claims; message also has a "lenght" typo above.
print(f"Time taken to read CSV: {end_time - start_time:.2f} seconds")
# (Commented-out per-row debug dump kept for reference.)
# for ix, row in df[['id', 'process_comment', 'years_found', 'months_found', 'fuzzy_months_found', 'person_name_matches', 'person_name_matches_possibilities']].iterrows():
#     print(f"id {row['id']} | {row['process_comment']} | {row['years_found']} | {row['months_found']} | Person Name Matches : {row['person_name_matches']}")
#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
#         print(f"id {row['id']} | Fuzzy person Matches : {row['person_name_matches_possibilities']}")
#     if row['months_found'] == ["N/A"] and row['fuzzy_months_found'] != ["N/A"]:
#         print(f"id {row['id']} | Fuzzy month Matches : {row['fuzzy_months_found']}")