"""Regex-based extraction of structured data from bank transfer comments.

For every account record the free-text ``process_comment`` is scanned for a
known resident name, an apartment number, Turkish month names, years and a
payment type.  Whatever is found is stored as ``AccountRecordsPredict`` rows
with ``prediction_model="regex"``.
"""
import re
import traceback
from difflib import SequenceMatcher
from itertools import permutations
from json import loads, dumps
from time import perf_counter

import arrow
from sqlalchemy import text as sqlalchemy_text
from unidecode import unidecode

from Controllers.Postgres.engine import get_session_factory
from Schemas.account.account import AccountRecordsPredict, AccountRecords


# ---------------------------------------------------------------------------
# Module-level configuration.
#
# These names are read by extract_months() / extract_year().  They are defined
# at module level (not only under ``__main__``) so the extractors also work
# when this module is imported and ``main`` is called from elsewhere.
# ---------------------------------------------------------------------------
turkish_months = [
    "OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN",
    "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK",
]
turkish_months_abbr = {
    "OCA": "OCAK",
    "SUB": "ŞUBAT",
    "ŞUB": "ŞUBAT",
    "MAR": "MART",
    "NIS": "NİSAN",
    "MAY": "MAYIS",
    "HAZ": "HAZİRAN",
    "HZR": "HAZİRAN",
    "TEM": "TEMMUZ",
    "AGU": "AĞUSTOS",
    "AGT": "AĞUSTOS",
    "EYL": "EYLÜL",
    "EKI": "EKİM",
    "KAS": "KASIM",
    "ARA": "ARALIK",
}
start_year = 1950
current_year = arrow.now().year

# Month name -> month number, keyed both with Turkish characters and with
# their ASCII transliteration.
_MONTH_TO_NUMBER = {
    "ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6,
    "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11,
    "aralık": 12,
    "subat": 2, "mayis": 5, "agustos": 8, "eylul": 9, "kasim": 11,
    "aralik": 12,
}

# Characters that clean_text() turns into a single space.
_SEPARATOR_TABLE = str.maketrans({ch: " " for ch in "/_-+*,."})

# Upper-case Turkish letters mapped to their lower-case form before
# transliteration ('I' must become dotless 'ı', which str.lower() cannot do).
_TURKISH_LOWER_TABLE = str.maketrans({
    'İ': 'i', 'I': 'ı', 'Ş': 'ş', 'Ğ': 'ğ', 'Ü': 'ü', 'Ö': 'ö', 'Ç': 'ç',
})

# Apartment-number patterns, tried in order; the first hit wins.  Group 1 is
# always the apartment number.
_APARTMENT_PATTERNS = tuple(
    re.compile(expr, re.IGNORECASE)
    for expr in (
        # "2 nolu daire", "11nolu daire", "2NOLU DAIRE", "3 nolu dairenin".
        # The trailing \w* swallows Turkish suffixes so no "nin" residue is
        # left behind in the cleaned comment.
        r'(\d+)\s*nolu\s*daire\w*',
        r'(\d+)\s*no\s*lu\s*daire',   # "12 No lu daire"
        r'daire\s*no\s*(\d+)',        # "Daire No 12"
        r'(\d+)\s*numarali\s*dai',    # "9 NUMARALI DAI"
        r'daire\s*(\d+)',             # "daire 3"
        r'd\s*:\s*(\d+)',             # "D:10"
        r'no\s*:\s*(\d+)',            # "NO:11", "NO :3"
    )
)

# Stand-alone four-digit ASCII tokens; candidates for extract_year().
_FOUR_DIGIT_TOKEN = re.compile(r'\b[0-9]{4}\b')

# Payment type -> spellings seen in comments.
_PAYMENT_KEYWORDS = {
    'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'],
    'tadilat': ['tadilat', 'tadİlat', 'tadilatı'],
    'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti'],
}
_PAYMENT_PATTERNS = tuple(
    (payment_type, re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE))
    for payment_type, keywords in _PAYMENT_KEYWORDS.items()
    for keyword in keywords
)

# (key in the extraction dict, prediction_field stored in DB, JSON-wrap value?)
_PREDICTION_FIELDS = (
    ("months", "months", True),
    ("years", "years", True),
    ("payment_types", "payment_types", True),
    ("apartment_number", "apartment_number", False),
    ("name_match", "person_name", True),
)


def clean_text(text):
    """Return *text* with long digit runs and separator punctuation removed.

    Runs of 8+ digits (account / reference numbers) and the characters
    ``/ _ - + * , .`` are replaced by spaces, then whitespace is collapsed
    and stripped.  ``None`` yields an empty string instead of ``"None"``.
    """
    if text is None:
        return ""
    text = re.sub(r'\d{8,}', ' ', str(text))
    # NOTE: disabled on purpose in the original - strips mixed alphanumeric tokens.
    # text = re.sub(r'\b[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*\b|\b[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*\b', ' ', text)
    text = text.translate(_SEPARATOR_TABLE)
    return re.sub(r'\s+', ' ', text).strip()


def normalize_text(text):
    """Return *text* lower-cased and transliterated to ASCII.

    Turkish upper-case letters are lowered explicitly first so that ``I``
    and ``İ`` both end up as ``i`` after transliteration.
    """
    return unidecode(text.translate(_TURKISH_LOWER_TABLE)).lower()


def _normalize_aligned(text):
    """Normalize *text* so that index ``i`` still refers to ``text[i]``.

    ``unidecode`` may expand or drop characters; spans found in the
    normalized text are used to slice the original, so both strings must
    have the same length.  When they differ, each character is normalized
    on its own and forced to exactly one output character.
    """
    normalized = normalize_text(text)
    if len(normalized) == len(text):
        return normalized
    return "".join((normalize_text(ch) or " ")[:1] for ch in text)


def _first_ascii_upper(value):
    """Return the upper-cased ASCII initial of *value*, or ``""`` if none."""
    ascii_value = unidecode(value.strip()) if value else ""
    return ascii_value[:1].upper()


def get_person_initials(person):
    """Return the upper-case ASCII initials of a person's name parts.

    Parts are taken in the order firstname, middle_name, surname, birthname;
    missing, blank or untransliterable parts are skipped.
    """
    keys = ("firstname", "middle_name", "surname", "birthname")
    initials = (_first_ascii_upper(person.get(key, "")) for key in keys)
    return [initial for initial in initials if initial]


def get_text_initials(matched_text):
    """Return the upper-case ASCII initial of every word in *matched_text*."""
    initials = (_first_ascii_upper(word) for word in matched_text.split())
    return [initial for initial in initials if initial]


def _stripped_field(person, key):
    """Return ``person[key]`` stripped, or ``""`` when missing or falsy."""
    value = person.get(key)
    return value.strip() if value else ""


def _name_part(original):
    """Build the orig / norm / init variants used for pattern generation."""
    normalized = normalize_text(original)
    return {'orig': original, 'norm': normalized, 'init': normalized[:1]}


def _create_patterns(parts, formats, separators=None):
    """Return ``\\b...\\b`` regex sources for every format/separator combination.

    *formats* is a list of ``[(variant, part_name), ...]`` sequences; entries
    whose part is absent from *parts* are skipped.  Both the name pieces and
    the separator are escaped so characters such as ``.`` match literally.
    """
    if separators is None:
        separators = [""]
    patterns = []
    for fmt in formats:
        for sep in separators:
            pieces = [
                re.escape(parts[part_name][variant])
                for variant, part_name in fmt
                if part_name in parts and variant in parts[part_name]
            ]
            if pieces:
                patterns.append(r"\b" + re.escape(sep).join(pieces) + r"\b")
    return patterns


def generate_dictonary_of_patterns(people):
    """Build ``{person_id: [compiled regex, ...]}`` for every usable person.

    People without both a firstname and a surname are skipped.

    TODO (from the original author): completely remove middle_name and
    instead match ``firstName + SomeWord + surname``.
    """
    patterns_dict = {}
    for person in people:
        person_id = person.get('id')
        firstname = _stripped_field(person, 'firstname')
        middle_name = _stripped_field(person, 'middle_name')
        surname = _stripped_field(person, 'surname')
        birthname = _stripped_field(person, 'birthname')
        if not firstname or not surname:
            continue

        name_parts = {
            'firstname': _name_part(firstname),
            'surname': _name_part(surname),
        }
        if middle_name:
            name_parts['middle_name'] = _name_part(middle_name)
        # A birthname equal to the surname adds no new pattern.
        if birthname and normalize_text(birthname) != name_parts['surname']['norm']:
            name_parts['birthname'] = _name_part(birthname)

        has_middle_name = 'middle_name' in name_parts
        person_patterns = set()

        # NOTE(review): when a middle name exists the firstname+surname
        # formats are *replaced*, not extended, so such a person only matches
        # when the middle name is written too.  Kept as in the original -
        # confirm whether this is intended.
        if has_middle_name:
            name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')],
            ]
        else:
            name_formats = [
                [('orig', 'firstname'), ('orig', 'surname')],
                [('norm', 'firstname'), ('norm', 'surname')],
                [('orig', 'surname'), ('orig', 'firstname')],
                [('norm', 'surname'), ('norm', 'firstname')],
            ]
        person_patterns.update(_create_patterns(name_parts, name_formats, [" ", ""]))

        if has_middle_name:
            middle_name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name')],
                [('norm', 'firstname'), ('norm', 'middle_name')],
                [('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'middle_name'), ('norm', 'surname')],
            ]
            person_patterns.update(
                _create_patterns(name_parts, middle_name_formats, [" ", ""])
            )

        if 'birthname' in name_parts:
            birthname_formats = [
                [('orig', 'firstname'), ('orig', 'birthname')],
                [('norm', 'firstname'), ('norm', 'birthname')],
                [('orig', 'birthname'), ('orig', 'firstname')],
                [('norm', 'birthname'), ('norm', 'firstname')],
            ]
            person_patterns.update(
                _create_patterns(name_parts, birthname_formats, [" ", ""])
            )

        # The first format degrades to firstname+surname when there is no
        # middle name, so a separate "triple initials" pass would only add
        # duplicates to the set.
        initial_formats = [
            [('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')],
            [('init', 'firstname'), ('init', 'surname')],
        ]
        person_patterns.update(
            _create_patterns(name_parts, initial_formats, ["", ".", " ", ". "])
        )

        patterns_dict[person_id] = [
            re.compile(pattern, re.IGNORECASE) for pattern in person_patterns
        ]
    return patterns_dict


def _normalized_field(person, key):
    """Return the normalized, stripped ``person[key]`` or ``""``."""
    value = person.get(key)
    return normalize_text(value.strip()) if value else ""


def _longest_non_overlapping(matches):
    """Keep the longest matches, dropping any that overlap an earlier pick."""
    kept = []
    for candidate in sorted(matches, key=lambda m: len(m['matched_text']), reverse=True):
        overlaps = any(
            candidate['start'] < other['end'] and candidate['end'] > other['start']
            for other in kept
        )
        if not overlaps:
            kept.append(candidate)
    return kept


def extract_person_name_with_regex(found_dict, process_comment, patterns_dict, people):
    """Find known people in *process_comment* and blank their names out.

    A match is accepted only if it spans more than one word and contains
    the person's firstname together with the surname or the birthname.

    Args:
        found_dict: extraction result dict; ``"name_match"`` is set to the
            person dict of the last person with an accepted match.
        process_comment: cleaned comment text.
        patterns_dict: output of :func:`generate_dictonary_of_patterns`.
        people: list of person dicts (``id``, ``firstname``, ...).

    Returns:
        ``(found_dict, cleaned_text)`` where *cleaned_text* is the comment
        with every word of every accepted match removed.
    """
    cleaned_text = process_comment
    all_matches = []

    # First person wins for duplicate ids, like the original linear search.
    people_by_id = {}
    for candidate in people:
        people_by_id.setdefault(candidate.get('id'), candidate)

    # Spans found here are used to slice process_comment, hence "aligned".
    text_norm = _normalize_aligned(process_comment)

    for person_id, patterns in patterns_dict.items():
        person = people_by_id.get(person_id)
        if not person:
            continue
        firstname_norm = _normalized_field(person, "firstname")
        surname_norm = _normalized_field(person, "surname")
        birthname_norm = _normalized_field(person, "birthname")

        person_matches = []
        for pattern in patterns:
            for match in pattern.finditer(text_norm):
                start, end = match.span()
                matched_text = process_comment[start:end]
                matched_text_norm = normalize_text(matched_text)
                # Single-word matches (initials, glued names) are rejected.
                if len(matched_text_norm.split()) <= 1:
                    continue
                has_firstname = bool(firstname_norm) and firstname_norm in matched_text_norm
                has_surname = bool(surname_norm) and surname_norm in matched_text_norm
                has_birthname = bool(birthname_norm) and birthname_norm in matched_text_norm
                if has_firstname and (has_surname or has_birthname):
                    person_matches.append(
                        {'matched_text': matched_text, 'start': start, 'end': end}
                    )

        non_overlapping_matches = _longest_non_overlapping(person_matches)
        if non_overlapping_matches:
            found_dict["name_match"] = person
            all_matches.extend((match, person) for match in non_overlapping_matches)

    if all_matches:
        all_matches.sort(key=lambda pair: pair[0]['start'], reverse=True)
        for match, _person in all_matches:
            for word in match['matched_text'].split():
                word_norm = normalize_text(word).strip()
                if not word_norm:
                    continue
                word_pattern = re.compile(rf'\b{re.escape(word_norm)}\b', re.IGNORECASE)
                # Blank with equal-length padding so spans stay valid while
                # several occurrences are replaced.
                current_norm = _normalize_aligned(cleaned_text)
                for word_match in word_pattern.finditer(current_norm):
                    start, end = word_match.span()
                    cleaned_text = (
                        cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:]
                    )
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return found_dict, cleaned_text


def extract_build_parts_info(found_dict, process_comment):
    """Extract an apartment number from *process_comment*.

    Recognised spellings include::

        2 nolu daire
        9 NUMARALI DAI
        daire 3
        3 nolu dairenin
        11nolu daire
        Daire No 12
        2NOLU DAIRE
        12 No lu daire
        D:10
        NO:11
        NO :3

    Returns:
        ``(found_dict, cleaned_text)``.  On a hit
        ``found_dict['apartment_number']`` holds the number as a string and
        the matched phrase is removed from the text; otherwise both are
        returned unchanged.
    """
    for pattern in _APARTMENT_PATTERNS:
        match = pattern.search(process_comment)
        if match is None:
            continue
        found_dict['apartment_number'] = match.group(1)
        remaining = process_comment.replace(match.group(0), '').strip()
        return found_dict, re.sub(r'\s+', ' ', remaining).strip()
    return found_dict, process_comment


def _normalize_turkish(text):
    """Lower-case *text* and strip Turkish diacritics for dictionary lookup."""
    text = text.lower()
    text = text.replace('i\u0307', 'i')  # 'İ'.lower() is 'i' + combining dot
    text = text.replace('ı', 'i')        # dotless i -> i
    return unidecode(text)


def _month_number(month_name):
    """Return the month number for *month_name*, or ``None`` if unknown."""
    lowered = month_name.lower()
    if lowered in _MONTH_TO_NUMBER:
        return _MONTH_TO_NUMBER[lowered]
    return _MONTH_TO_NUMBER.get(_normalize_turkish(month_name))


def extract_months(found_dict, process_comment):
    """Extract Turkish month names and abbreviations from the comment.

    Full names (``turkish_months``) are searched first, then abbreviations
    (``turkish_months_abbr``) in what is left.  Each hit appends
    ``{'name': <full month>, 'number': <1-12 or None>}`` to
    ``found_dict['months']``.

    Returns:
        ``(found_dict, working_text)`` with the matched words removed.
        Removal is word-bounded, so a month string that merely occurs
        inside a longer word is never cut out.
    """
    found_dict.setdefault('months', [])
    working_text = process_comment

    for month in turkish_months:
        pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
        hits = len(pattern.findall(process_comment))
        if not hits:
            continue
        month_number = _month_number(month)
        found_dict['months'].extend(
            {'name': month, 'number': month_number} for _ in range(hits)
        )
        working_text = pattern.sub('', working_text, count=hits)

    for abbr, full_month in turkish_months_abbr.items():
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
        working_text, hits = pattern.subn('', working_text)
        if not hits:
            continue
        month_number = _month_number(full_month)
        found_dict['months'].extend(
            {'name': full_month, 'number': month_number} for _ in range(hits)
        )

    return found_dict, working_text


def extract_year(found_dict, process_comment):
    """Extract years between ``start_year`` and ``current_year`` (inclusive).

    Every stand-alone four-digit number in range is appended to
    ``found_dict['years']`` in ascending order (repeats are kept).

    Returns:
        ``(found_dict, working_text)`` with the matched years removed.
        Removal is word-bounded, so digits inside a longer number are left
        untouched.
    """
    found_dict.setdefault('years', [])
    working_text = process_comment

    candidates = (int(token) for token in _FOUR_DIGIT_TOKEN.findall(process_comment))
    years = sorted(year for year in candidates if start_year <= year <= current_year)
    for year in years:
        found_dict['years'].append(year)
        working_text = re.sub(rf'\b{year}\b', '', working_text, count=1)

    return found_dict, working_text


def extract_payment_type(found_dict, process_comment):
    """Extract the payment type(s) mentioned in the comment.

    Recognised words: aidat / AİD / aidatı, TADİLAT, YAKIT / yakıt / yakit
    and their spelling variants.  Each detected type is added once to
    ``found_dict['payment_types']``.

    Returns:
        ``(found_dict, working_text)`` with the matched keywords removed.
        Removal is word-bounded; overlapping spellings (``aid`` / ``aİd``)
        can no longer eat into unrelated words.
    """
    found_dict.setdefault('payment_types', [])
    working_text = process_comment

    for payment_type, pattern in _PAYMENT_PATTERNS:
        hits = len(pattern.findall(process_comment))
        if not hits:
            continue
        if payment_type not in found_dict['payment_types']:
            found_dict['payment_types'].append(payment_type)
        working_text = pattern.sub('', working_text, count=hits)

    return found_dict, working_text


def _upsert_prediction(account_record_id, prediction_field, prediction_result,
                       save_dict, update_dict):
    """Create or update the regex prediction row for one record and field."""
    existing = AccountRecordsPredict.query.filter_by(
        account_records_id=account_record_id,
        prediction_field=prediction_field,
        prediction_model="regex",
    ).first()
    if not existing:
        created = AccountRecordsPredict.create(
            **save_dict,
            prediction_field=prediction_field,
            prediction_result=prediction_result,
        )
        created.save()
    else:
        existing.update(**update_dict, prediction_result=prediction_result)
        existing.save()


def main(session, account_records, people):
    """Run all extractors over *account_records* and persist the results.

    Args:
        session: database session handed to the model classes.
        account_records: list of dicts with at least ``id`` and
            ``process_comment``.
        people: list of person dicts used for name matching.
    """
    list_of_regex_patterns = generate_dictonary_of_patterns(people=people)
    dicts_found = {}
    count_extracted = 0

    for account_record in account_records:
        account_record_id = str(account_record["id"])
        found_dict = {}
        process_comment_iteration = clean_text(text=account_record["process_comment"])
        found_dict, cleaned_process_comment = extract_person_name_with_regex(
            found_dict=found_dict,
            process_comment=process_comment_iteration,
            patterns_dict=list_of_regex_patterns,
            people=people,
        )
        found_dict, cleaned_process_comment = extract_build_parts_info(
            found_dict=found_dict, process_comment=cleaned_process_comment
        )
        found_dict, cleaned_process_comment = extract_months(
            found_dict=found_dict, process_comment=cleaned_process_comment
        )
        found_dict, cleaned_process_comment = extract_year(
            found_dict=found_dict, process_comment=cleaned_process_comment
        )
        found_dict, cleaned_process_comment = extract_payment_type(
            found_dict=found_dict, process_comment=cleaned_process_comment
        )
        # The extractors always create their (possibly empty) list keys, so
        # test the values rather than the dict itself.
        if any(found_dict.values()):
            dicts_found[account_record_id] = found_dict

    AccountRecordsPredict.set_session(session)
    AccountRecords.set_session(session)

    for id_, item in dicts_found.items():
        account_record_to_save = AccountRecords.query.filter_by(id=int(id_)).first()
        if account_record_to_save is None:
            print(f"account record {id_} not found, skipping")
            continue

        count_extracted += 1
        save_dict = dict(
            account_records_id=account_record_to_save.id,
            account_records_uu_id=str(account_record_to_save.uu_id),
            prediction_model="regex",
            treshold=1,
            is_first_prediction=False,
        )
        update_dict = dict(prediction_model="regex", treshold=1, is_first_prediction=False)

        for item_key, prediction_field, as_json in _PREDICTION_FIELDS:
            value = item.get(item_key)
            if not value:
                continue
            print(f"{prediction_field}: {value}")
            # The apartment number is stored raw; everything else is
            # wrapped as {"data": ...} JSON.
            prediction_result = dumps({"data": value}) if as_json else value
            _upsert_prediction(
                account_record_to_save.id,
                prediction_field,
                prediction_result,
                save_dict,
                update_dict,
            )

    print("\n===== SUMMARY =====")
    print(f"extracted data total : {count_extracted}")
    print(f"not extracted data total : {len(account_records) - count_extracted}")
    print(f"Total account records processed : {len(account_records)}")


if __name__ == "__main__":
    session_factory = get_session_factory()
    session = session_factory()
    try:
        # The "id" column returned here is build_living_space.id.
        people_query = sqlalchemy_text("""
            SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
            FROM public.people as p
            INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
            INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
            INNER JOIN public.build as b ON b.id = bp.build_id
            WHERE b.id = 1
            ORDER BY p.id
        """)
        people_raw = session.execute(people_query).all()

        # Drop rows that repeat the same full name, keeping the first.
        seen_names = set()
        clean_people_list = []
        for person in people_raw:
            merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
            if merged_name not in seen_names:
                seen_names.add(merged_name)
                clean_people_list.append(person)
        people = [
            {"firstname": p[0], "middle_name": p[1], "surname": p[2],
             "birthname": p[3], 'id': p[4]}
            for p in clean_people_list
        ]

        query_account_records = sqlalchemy_text("""
            SELECT a.id, a.iban, a.bank_date, a.process_comment
            FROM public.account_records as a
            where currency_value > 0
        """)  # and bank_date::date >= '2020-01-01'
        account_records = session.execute(query_account_records).all()
        account_records = [
            {"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]}
            for ar in account_records
        ]

        main(session=session, account_records=account_records, people=people)
    except Exception as e:
        # Best-effort batch script: report the failure (with traceback) and
        # fall through to the cleanup below.
        print(f"{e}")
        traceback.print_exc()
    finally:
        session.close()
        session_factory.remove()