# python-account-miner/dataCleaning.py
# (225 lines, 9.9 KiB, Python)

import re
import pandas as pd
from unidecode import unidecode
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text
# Shared SQLAlchemy session used once below to load the people list.
session_factory = get_session_factory()
session = session_factory()
# Canonical Turkish month names (uppercase, with diacritics), January..December.
turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
# Common abbreviations -> full month name; both ASCII and Turkish spellings of
# the same abbreviation are listed (e.g. SUB / ŞUB).
turkish_months_abbr = {
"OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
"TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
# Accepted range for extracted years (inclusive).
start_year = 1950
current_year = pd.Timestamp.now().year
# People living in build id 1; consumed by the name-matching helpers below.
people = session.execute(
sqlalchemy_text("""
SELECT p.firstname, p.middle_name, p.surname, bl.id FROM public.people as p
inner join public.build_living_space as bl on bl.person_id = p.id
inner join public.build_parts as bp on bp.id = bl.build_parts_id
inner join public.build as b on b.id = bp.build_id
where b.id = 1
""")
).all()
# NOTE: 'id' here is bl.id (the living-space row id), not the person id.
people = [{"firstname": p[0], "middle_name": p[1], "last_name": p[2], 'id': p[3]} for p in people]
def clean_text(text):
    """Normalize a free-text comment for downstream tokenizing.

    Hyphens, asterisks and commas are treated as word separators and
    replaced with spaces; all runs of whitespace (including those created
    by the replacement) are collapsed to single spaces.

    Parameters: text -- any value; coerced to str (tolerates NaN/numbers).
    Returns: the cleaned single-spaced string.
    """
    text = str(text)
    # Replace separator punctuation first, THEN collapse whitespace, so
    # inputs like "a - b" become "a b" rather than "a   b".
    for separator in ("-", "*", ","):
        text = text.replace(separator, " ")
    return " ".join(text.split())
def is_valid_char_match(word, month):
    """Return True when every distinct letter of *word* also occurs in *month*."""
    return not (set(word) - set(month))
def best_month_matches(text, threshold=0.7):
    """Fuzzy-detect Turkish month names among the words of *text*.

    Each word is first checked for an exact hit in the abbreviation table
    (score 1.0). Otherwise it is fuzzy-compared against every full month
    name (ASCII-normalized); the first month whose similarity reaches
    *threshold* and whose letters cover the word wins.

    Returns a list of (Month, word, score) tuples, or ["N/A"] when no
    word matched anything.
    """
    found = []
    for token in clean_text(text).split():
        expansion = turkish_months_abbr.get(token)
        if expansion is not None:
            found.append((expansion.capitalize(), token, 1.0))
            continue
        for month_name in turkish_months:
            ascii_month = unidecode(month_name.upper())
            score = SequenceMatcher(None, token, ascii_month).ratio()
            if score >= threshold and is_valid_char_match(token, ascii_month):
                found.append((month_name.capitalize(), token, round(score, 2)))
                break
    return found or ["N/A"]
def print_out_results(data_frame, df):
    """Print each entry of *data_frame* with its index, then a found/total tally."""
    for row_index, comment in enumerate(data_frame):
        print(f'Row number {row_index} : {comment}')
    print(len(data_frame), '/', len(df))
def extract_years(text, min_year=1950, max_year=None):
    """Extract plausible 4-digit years (19xx / 20xx) from *text*.

    Parameters
    ----------
    text : any -- coerced to str before matching.
    min_year : int -- lowest accepted year (default matches module start_year).
    max_year : int | None -- highest accepted year; None means the current year.

    Returns the matched year strings in order of appearance, or ["N/A"]
    when none fall inside the accepted range.
    """
    if max_year is None:
        max_year = pd.Timestamp.now().year
    candidates = re.findall(r'\b(19\d{2}|20\d{2})\b', str(text))
    valid_years = [year for year in candidates if min_year <= int(year) <= max_year]
    return valid_years if valid_years else ["N/A"]
def extract_months(text):
    """Diacritic-insensitive substring search for Turkish month names.

    Both *text* and the month names are transliterated to lowercase ASCII
    before the containment check. Returns the months in canonical
    (uppercase Turkish) spelling, or ["N/A"] when none occur.
    """
    haystack = unidecode(str(text)).lower()
    hits = [
        month
        for month in turkish_months
        if unidecode(month).lower() in haystack
    ]
    return hits if hits else ["N/A"]
def normalize_text(text):
    """Lower-case *text* after transliterating it to plain ASCII."""
    ascii_form = unidecode(text)
    return ascii_form.lower()
def build_name_regex_all_combinations(person):
    """Compile regexes covering every ordering of a person's name parts.

    The first/middle/last name parts are ASCII-normalized and lowercased;
    for every subset size r, each permutation becomes a pattern tolerating
    optional whitespace between parts. A fully merged form (e.g.
    "fatihergunguclu") is appended as well.

    Returns a list of (compiled_pattern, weight) pairs, where weight is
    the number of name parts the pattern covers.
    """
    raw_parts = (
        person.get("firstname", "").strip(),
        person.get("middle_name", "").strip(),
        person.get("last_name", "").strip(),
    )
    parts = [unidecode(part).lower() for part in raw_parts if part]
    compiled = []
    for size in range(1, len(parts) + 1):
        # All orderings, not just sorted combinations, since names may
        # appear surname-first in the source text.
        for ordering in permutations(parts, size):
            body = r"\s*".join(re.escape(piece) for piece in ordering)
            compiled.append((re.compile(r"\b" + body + r"\b", flags=re.IGNORECASE), size))
    if len(parts) >= 2:
        # Names sometimes appear with no separators at all.
        glued = ''.join(parts)
        compiled.append((re.compile(rf"\b{re.escape(glued)}\b", flags=re.IGNORECASE), len(parts)))
    return compiled
def get_person_initials(person):
    """Return the uppercase ASCII initial of each present name part.

    Skips parts that are falsy, whitespace-only, or that transliterate to
    an empty ASCII string — the original list comprehension raised
    IndexError at ``[0]`` for a whitespace-only part.
    """
    parts = (person.get("firstname", ""), person.get("middle_name", ""), person.get("last_name", ""))
    initials = []
    for part in parts:
        if not part:
            continue
        ascii_part = unidecode(part.strip())
        if ascii_part:
            initials.append(ascii_part[0].upper())
    return initials
def get_text_initials(matched_text):
    """Return the uppercase ASCII initial of each word in *matched_text*.

    Words that transliterate to an empty ASCII string are skipped instead
    of raising IndexError at ``[0]``.
    """
    initials = []
    for word in matched_text.split():
        ascii_word = unidecode(word.strip())
        if ascii_word:
            initials.append(ascii_word[0].upper())
    return initials
def find_person_regex_all_combinations(text, person):
    """Run every name-permutation pattern for *person* over *text*.

    Returns one dict per regex hit, carrying the person record, the exact
    matched substring, and the pattern's weight (name parts covered).
    """
    normalized = normalize_text(text)
    hits = []
    for compiled, weight in build_name_regex_all_combinations(person):
        hits.extend(
            {
                "matched_person": person,
                "matched_text": found.group().strip(),
                "weight": weight,
            }
            for found in compiled.finditer(normalized)
        )
    return hits
def find_all_person_matches(text):
    """Match known people against *text*, keeping only plausible hits.

    A hit is accepted only when at least two initials of the matched
    substring coincide with the person's initials; per person the
    highest-weight hit is retained. Returns the matched person dicts
    sorted by descending weight, or ["N/A"] when nobody matched.
    """
    best_by_person = {}
    for person in people:
        for hit in find_person_regex_all_combinations(text, person):
            candidate = hit["matched_person"]
            person_initials = get_person_initials(candidate)
            text_initials = get_text_initials(hit["matched_text"])
            overlap = sum(1 for letter in text_initials if letter in person_initials)
            if overlap < 2:
                continue
            key = tuple(candidate.values())
            weight = hit["weight"]
            previous = best_by_person.get(key)
            if previous is None or previous[0] < weight:
                best_by_person[key] = (weight, candidate)
    ranked = sorted(best_by_person.values(), key=lambda entry: -entry[0])
    return [entry[1] for entry in ranked] or ["N/A"]
def find_all_person_matches_possibilities(text):
    """Like find_all_person_matches but without the initials sanity check.

    Every regex hit counts; per person the highest-weight hit wins (the
    original overwrote unconditionally, which only happened to be
    equivalent because patterns are generated in ascending weight order).
    Unused initials computations removed as dead code.

    Returns person dicts sorted by descending weight, or ["N/A"] when no
    pattern matched at all.
    """
    best_by_person = {}
    for person in people:
        for hit in find_person_regex_all_combinations(text, person):
            candidate = hit["matched_person"]
            weight = hit["weight"]
            key = tuple(candidate.values())
            # Keep the strongest (most name parts) hit per person.
            if key not in best_by_person or best_by_person[key][0] < weight:
                best_by_person[key] = (weight, candidate)
    ranked = sorted(best_by_person.values(), key=lambda entry: -entry[0])
    return [entry[1] for entry in ranked] or ["N/A"]
# ---------------------------------------------------------------------------
# Pipeline: load incoming account records, clean the free-text comment, then
# derive year / month / person-name columns and report hit rates.
# ---------------------------------------------------------------------------
start_time = perf_counter()
df = pd.read_csv("account_records_incoming.csv")
df["process_comment"] = df["process_comment"].apply(clean_text)

# First 4-digit number per comment, kept only when within [start_year, current_year].
df["year"] = df["process_comment"].str.extract(r'(\d{4})')
df["year"] = pd.to_numeric(df["year"], errors='coerce')
df["year"] = df["year"].where(df["year"].between(start_year, current_year))

# Derived feature columns; each cell is a list, or the ["N/A"] sentinel.
df["years_found"] = df["process_comment"].apply(extract_years)
df["months_found"] = df["process_comment"].apply(extract_months)
df["fuzzy_months_found"] = df["process_comment"].apply(best_month_matches)
df["person_name_matches"] = df["process_comment"].apply(find_all_person_matches)
df["person_name_matches_possibilities"] = df["process_comment"].apply(find_all_person_matches_possibilities)

# Hit-rate breakdown.
month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
# Rows where the direct month scan failed but the fuzzy scan succeeded.
# (Was `|`, which also counted rows with no fuzzy hit at all; `&` mirrors
# the people_found_possibilities filter below.)
month_indirect_found = df[df['months_found'].apply(lambda x: x == ["N/A"]) & df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])]
years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"])]
people_found_possibilities = df[df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"]) & df['person_name_matches'].apply(lambda x: x == ["N/A"])]

print("length of years where is not N/A : ", len(years_found), '/', len(df))
print("length of months where is not N/A : ", len(month_direct_found), '/', len(df))
print("length of fuzzy months where is not N/A : ", len(month_indirect_found), '/', len(df))
print("length of names found", len(people_found), '/', len(df))
print("length of names found with possibilities", len(people_found_possibilities), '/', len(df))
end_time = perf_counter()
# The timer spans the whole pipeline, not just the CSV read.
print(f"Time taken to process records: {end_time - start_time:.2f} seconds")