production-evyos-systems-an.../ServicesBank/Parser/runner.py

637 lines
27 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import arrow
from json import loads, dumps
from unidecode import unidecode
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter
from sqlalchemy import text as sqlalchemy_text
from Controllers.Postgres.engine import get_session_factory
from Schemas.account.account import AccountRecordsPredict, AccountRecords
def clean_text(text):
    """Normalize a bank-transfer comment for downstream regex extraction.

    Drops long digit runs (8+ digits, i.e. account/reference numbers),
    turns common separator punctuation into spaces, then collapses and
    trims whitespace.
    """
    raw = str(text)
    # Long digit runs are IDs/references, not meaningful words.
    raw = re.sub(r'\d{8,}', ' ', raw)
    # Map every separator character to a single space in one pass.
    separators = "/_-+*,."
    raw = raw.translate(str.maketrans(separators, " " * len(separators)))
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', raw).strip()
def normalize_text(text):
    """Lower-case and ASCII-fold text, honoring Turkish casing rules.

    Turkish upper-case letters are mapped to their proper lower-case
    forms first (notably I -> ı and İ -> i, which a plain .lower() gets
    wrong for Turkish), then unidecode strips the diacritics.
    """
    turkish_lower = str.maketrans("İIŞĞÜÖÇ", "iışğüöç")
    return unidecode(text.translate(turkish_lower)).lower()
def get_person_initials(person):
    """Return the upper-case ASCII initials of a person's name parts.

    Reads firstname, middle_name, surname and birthname (in that order)
    and yields one initial per usable part. Parts that are missing,
    empty, whitespace-only, or that ASCII-fold to an empty string are
    skipped (the original raised IndexError on those).
    """
    parts = [
        person.get("firstname", ""),
        person.get("middle_name", ""),
        person.get("surname", ""),
        person.get("birthname", ""),
    ]
    initials = []
    for part in parts:
        folded = unidecode(part.strip()) if part else ""
        if folded:  # guard: whitespace-only parts used to crash on [0]
            initials.append(folded[0].upper())
    return initials
def get_text_initials(matched_text):
    """Return the upper-case ASCII initial of each word in matched_text."""
    initials = []
    for token in matched_text.split():
        stripped = token.strip()
        if stripped:
            initials.append(unidecode(stripped)[0].upper())
    return initials
def generate_dictonary_of_patterns(people):
    """Build {person_id: [compiled regex, ...]} for spotting names in text.

    For every person that has at least a firstname and a surname, patterns
    are generated over the original spelling ('orig'), the ASCII-normalized
    spelling ('norm') and single-letter initials ('init'), in several word
    orders and joined by several separators ("", " ", ".", ". ").

    TODO (original note): completely remove middle_name and instead regex
    firstname + <some word> + surname.
    """
    patterns_dict = {}

    def _entry(value):
        # Precompute original, normalized and initial forms once; guard
        # against a normalization that comes back empty (used to IndexError).
        norm = normalize_text(value)
        return {'orig': value, 'norm': norm, 'init': norm[0] if norm else ""}

    for person in people:
        person_id = person.get('id')
        firstname = person.get('firstname', '').strip() if person.get('firstname') else ""
        middle_name = person.get('middle_name', '').strip() if person.get('middle_name') else ""
        surname = person.get('surname', '').strip() if person.get('surname') else ""
        birthname = person.get('birthname', '').strip() if person.get('birthname') else ""
        # A person is only matchable with both a firstname and a surname.
        if not firstname or not surname:
            continue
        name_parts = {'firstname': _entry(firstname), 'surname': _entry(surname)}
        if middle_name:
            name_parts['middle_name'] = _entry(middle_name)
        # Only keep a birthname that actually differs from the surname.
        if birthname and normalize_text(birthname) != normalize_text(surname):
            name_parts['birthname'] = _entry(birthname)
        person_patterns = set()

        def create_pattern(parts, formats, separators=None):
            """Return \\b-anchored patterns for every format/separator combo."""
            if separators is None:
                separators = [""]
            patterns = []
            for fmt in formats:
                for sep in separators:
                    pattern_parts = [
                        re.escape(parts[part_name][part_type])
                        for part_type, part_name in fmt
                        if part_name in parts and part_type in parts[part_name]
                    ]
                    if pattern_parts:
                        # BUGFIX: the separator was previously interpolated
                        # unescaped, so a "." separator acted as a regex
                        # wildcard in the initials patterns.
                        patterns.append(r"\b" + re.escape(sep).join(pattern_parts) + r"\b")
            return patterns

        name_formats = [
            [('orig', 'firstname'), ('orig', 'surname')],
            [('norm', 'firstname'), ('norm', 'surname')],
            [('orig', 'surname'), ('orig', 'firstname')],
            [('norm', 'surname'), ('norm', 'firstname')],
        ]
        if 'middle_name' in name_parts:
            # NOTE: with a middle name the surname-first orders are dropped;
            # behavior preserved from the original logic.
            name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')],
            ]
        person_patterns.update(create_pattern(name_parts, name_formats, [" ", ""]))
        if 'middle_name' in name_parts:
            middle_name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name')],
                [('norm', 'firstname'), ('norm', 'middle_name')],
                [('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'middle_name'), ('norm', 'surname')],
            ]
            person_patterns.update(create_pattern(name_parts, middle_name_formats, [" ", ""]))
        if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']:
            birthname_formats = [
                [('orig', 'firstname'), ('orig', 'birthname')],
                [('norm', 'firstname'), ('norm', 'birthname')],
                [('orig', 'birthname'), ('orig', 'firstname')],
                [('norm', 'birthname'), ('norm', 'firstname')],
            ]
            person_patterns.update(create_pattern(name_parts, birthname_formats, [" ", ""]))
        initial_formats = [
            [('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')],
            [('init', 'firstname'), ('init', 'surname')],
        ]
        person_patterns.update(create_pattern(name_parts, initial_formats, ["", ".", " ", ". "]))
        # (The original re-added the triple-initial format when a middle name
        # exists; person_patterns is a set, so initial_formats above already
        # covers it — the redundant block was removed.)
        patterns_dict[person_id] = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns]
    return patterns_dict
def extract_person_name_with_regex(found_dict, process_comment, patterns_dict, people):
    """Match person names in process_comment using precompiled patterns.

    Args:
        found_dict: accumulator dict; on a hit, the matched person dict is
            stored under found_dict["name_match"].
        process_comment: cleaned transfer comment to scan.
        patterns_dict: {person_id: [compiled regex, ...]} as produced by
            generate_dictonary_of_patterns().
        people: list of person dicts, used to validate candidate matches.

    Returns:
        (found_dict, cleaned_text) where cleaned_text is the comment with
        every word of each accepted match blanked out.

    NOTE(review): match spans are computed on the normalized text but used
    to slice the original comment — this assumes normalize_text() maps
    every character 1:1 in length; confirm for all occurring inputs.
    """
    cleaned_text = process_comment
    all_matches = []
    for person_id, patterns in patterns_dict.items():
        # Re-resolve the person record for this id (linear scan per person).
        person = next((p for p in people if p.get('id') == person_id), None)
        if not person:
            continue
        firstname_norm = normalize_text(person.get("firstname", "").strip()) if person.get("firstname") else ""
        middle_name_norm = normalize_text(person.get("middle_name", "").strip()) if person.get("middle_name") else ""
        surname_norm = normalize_text(person.get("surname", "").strip()) if person.get("surname") else ""
        birthname_norm = normalize_text(person.get("birthname", "").strip()) if person.get("birthname") else ""
        # Normalized view of the comment; patterns are matched against this.
        text_norm = normalize_text(process_comment)
        person_matches = []
        for pattern in patterns:
            for match in pattern.finditer(text_norm):
                start, end = match.span()
                # Slice the ORIGINAL comment with normalized-text indices.
                matched_text = process_comment[start:end]
                matched_text_norm = normalize_text(matched_text)
                is_valid_match = False
                # Strict validation: require both firstname AND surname/birthname
                # No single-word matches allowed
                if len(matched_text_norm.split()) <= 1:
                    # Single word matches are not allowed
                    is_valid_match = False
                else:
                    # For multi-word matches, require firstname AND (surname OR birthname)
                    has_firstname = firstname_norm and firstname_norm in matched_text_norm
                    has_surname = surname_norm and surname_norm in matched_text_norm
                    has_birthname = birthname_norm and birthname_norm in matched_text_norm
                    # Both firstname and surname/birthname must be present
                    if (has_firstname and has_surname) or (has_firstname and has_birthname):
                        is_valid_match = True
                if is_valid_match:
                    person_matches.append({
                        'matched_text': matched_text,
                        'start': start,
                        'end': end
                    })
        if person_matches:
            # Prefer the longest matches; then keep only non-overlapping spans.
            person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True)
            non_overlapping_matches = []
            for match in person_matches:
                overlaps = False
                for existing_match in non_overlapping_matches:
                    if (match['start'] < existing_match['end'] and match['end'] > existing_match['start']):
                        overlaps = True
                        break
                if not overlaps:
                    non_overlapping_matches.append(match)
            if non_overlapping_matches:
                # NOTE(review): overwritten for every person that matches —
                # only the LAST matching person survives in "name_match".
                found_dict["name_match"] = person
                all_matches.extend([(match, person) for match in non_overlapping_matches])
    if all_matches:
        # Process right-to-left so earlier spans stay valid while blanking.
        all_matches.sort(key=lambda x: x[0]['start'], reverse=True)
        for match, person in all_matches:
            matched_text = match['matched_text']
            matched_words = matched_text.split()
            for word in matched_words:
                word_norm = normalize_text(word).strip()
                if not word_norm:
                    continue
                # Re-normalize after each word removal round.
                text_norm = normalize_text(cleaned_text)
                for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE):
                    start, end = word_match.span()
                    # Overwrite with spaces (not deletion) to keep indices stable.
                    cleaned_text = cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:]
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return found_dict, cleaned_text
# Priority-ordered apartment-number patterns; group(1) is the number.
# Order preserved from the original cascade (an exact duplicate of the
# "(\d+)nolu\s*daire" pattern was removed).
_APARTMENT_PATTERNS = (
    re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE),     # "2 nolu daire"
    re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE),  # "3 nolu dairenin"
    re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE),        # "11nolu daire" / "2NOLU DAIRE"
    re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE),  # "12 No lu daire"
    re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE),       # "Daire No 12"
    re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE),   # "9 NUMARALI DAI"
    re.compile(r'daire\s*(\d+)', re.IGNORECASE),            # "daire 3"
    re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE),            # "D:10"
    re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE),           # "NO:11" / "NO :3"
)


def extract_build_parts_info(found_dict, process_comment):
    """Extract an apartment number from the process comment.

    Recognizes Turkish apartment-number spellings such as:
        2 nolu daire / 9 NUMARALI DAI / daire 3 / 3 nolu dairenin
        11nolu daire / Daire No 12 / 2NOLU DAIRE / 12 No lu daire
        D:10 / NO:11 / NO :3
    The first pattern (in priority order) that matches wins: its number is
    stored under found_dict['apartment_number'] and every occurrence of the
    matched text is removed from the returned comment.
    """
    for pattern in _APARTMENT_PATTERNS:
        match = pattern.search(process_comment)
        if match:
            found_dict['apartment_number'] = match.group(1)
            # Drop every occurrence of the matched text, then collapse the
            # leftover whitespace (same cleanup as the original helper).
            cleaned = process_comment.replace(match.group(0), '').strip()
            cleaned = re.sub(r'\s+', ' ', cleaned).strip()
            return found_dict, cleaned
    # No apartment info found: text passes through untouched.
    return found_dict, process_comment
def extract_months(found_dict, process_comment):
    """Extract Turkish month names and abbreviations from the comment.

    Appends {'name': <month>, 'number': <1-12 or None>} entries to
    found_dict['months'] and removes each matched month from the returned
    text. Reads the module-level globals turkish_months (full names) and
    turkish_months_abbr (abbreviation -> full name).
    """
    original_text = process_comment
    # Month-name -> month-number lookup with both the Turkish spelling and
    # the ASCII-folded spelling as keys (the original literal repeated the
    # identical keys in both halves; only the distinct ones are kept).
    month_to_number_dict = {
        "ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6,
        "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11, "aralık": 12,
        "subat": 2, "mayis": 5, "agustos": 8, "eylul": 9, "kasim": 11, "aralik": 12,
    }

    def normalize_turkish(text):
        """ASCII-fold Turkish text for case-insensitive dict lookups."""
        # BUGFIX: this previously ran text.replace('', 'i') AFTER lower(),
        # which inserted an 'i' between every character. The intent was to
        # fold the Turkish dotted/dotless i family to plain 'i'; this must
        # happen BEFORE lower(), because 'İ'.lower() yields 'i' + a
        # combining dot and would dodge a plain replace.
        text = text.replace('İ', 'i').replace('I', 'i').replace('ı', 'i')
        return unidecode(text.lower())

    def _month_number(name):
        # Try the literal lower-cased spelling first, then the ASCII fold.
        number = month_to_number_dict.get(name.lower())
        if number is None:
            number = month_to_number_dict.get(normalize_turkish(name))
        return number

    if 'months' not in found_dict:
        found_dict['months'] = []
    working_text = original_text
    # Full month names first, scanned against the original text.
    for month in turkish_months:
        pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
        for match in pattern.finditer(original_text):
            found_dict['months'].append({'name': month, 'number': _month_number(month)})
            working_text = working_text.replace(match.group(0), '', 1)
    # Then abbreviations, scanned against whatever text is left.
    for abbr, full_month in turkish_months_abbr.items():
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
        for match in pattern.finditer(working_text):
            found_dict['months'].append({'name': full_month, 'number': _month_number(full_month)})
            working_text = working_text.replace(match.group(0), '', 1)
    return found_dict, working_text
def extract_year(found_dict, process_comment):
    """Extract years from the process comment.

    A year is any standalone integer (word-bounded, no leading zero)
    between the module-level globals start_year and current_year,
    inclusive. Found years are appended to found_dict['years'] as ints
    and removed from the returned text.
    """
    if 'years' not in found_dict:
        found_dict['years'] = []
    working_text = process_comment
    # Single scan instead of compiling one regex per candidate year
    # (the original compiled ~75 patterns on every call).
    candidates = [
        m.group(0)
        for m in re.finditer(r'\b[1-9]\d*\b', process_comment)
        if start_year <= int(m.group(0)) <= current_year
    ]
    # The original emitted years in ascending-year order, then text order;
    # a stable sort on the numeric value reproduces exactly that.
    for matched_text in sorted(candidates, key=int):
        found_dict['years'].append(int(matched_text))
        working_text = working_text.replace(matched_text, '', 1)
    return found_dict, working_text
def extract_payment_type(found_dict, process_comment):
    """Detect payment-type keywords in the comment.

    Recognizes variants of aidat (dues), tadilat (renovation) and yakit
    (fuel). Each detected canonical type is appended once to
    found_dict['payment_types']; every keyword occurrence is removed from
    the returned text.
    """
    payment_keywords = {
        'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'],
        'tadilat': ['tadilat', 'tadİlat', 'tadilatı'],
        'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti']
    }
    types_found = found_dict.setdefault('payment_types', [])
    working_text = process_comment
    for payment_type, keywords in payment_keywords.items():
        for keyword in keywords:
            word_re = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
            # Scan the ORIGINAL text; strip hits from the working copy.
            for hit in word_re.finditer(process_comment):
                if payment_type not in types_found:
                    types_found.append(payment_type)
                working_text = working_text.replace(hit.group(0), '', 1)
    return found_dict, working_text
def _upsert_prediction(save_dict, update_dict, account_records_id, field, result):
    """Create or refresh the 'regex' AccountRecordsPredict row for one field."""
    prediction_result = AccountRecordsPredict.query.filter_by(
        account_records_id=account_records_id,
        prediction_field=field,
        prediction_model="regex",
    ).first()
    if not prediction_result:
        created_account_prediction = AccountRecordsPredict.create(
            **save_dict, prediction_field=field, prediction_result=result
        )
        created_account_prediction.save()
    else:
        prediction_result.update(**update_dict, prediction_result=result)
        prediction_result.save()


def main(session, account_records, people):
    """Run all regex extractors over account_records and persist predictions.

    Args:
        session: SQLAlchemy session bound to the project models.
        account_records: list of dicts with id / iban / bank_date /
            process_comment keys.
        people: list of person dicts fed to the name-pattern generator.

    Side effects: creates/updates AccountRecordsPredict rows (model
    "regex") and prints per-record hits plus a summary report.
    """
    list_of_regex_patterns = generate_dictonary_of_patterns(people=people)
    dicts_found = dict()
    dicts_not_found = dict()
    count_extracted = 0
    for account_record in account_records:
        account_record_id = str(account_record["id"])
        found_dict = {}
        # Each extractor consumes the text the previous one cleaned.
        process_comment_iteration = clean_text(text=account_record["process_comment"])
        found_dict, cleaned_process_comment = extract_person_name_with_regex(
            found_dict=found_dict,
            process_comment=process_comment_iteration,
            patterns_dict=list_of_regex_patterns,
            people=people,
        )
        for extractor in (extract_build_parts_info, extract_months, extract_year, extract_payment_type):
            found_dict, cleaned_process_comment = extractor(
                found_dict=found_dict, process_comment=cleaned_process_comment
            )
        # NOTE: extract_months/year/payment_type always seed list keys, so
        # found_dict is never empty here and dicts_not_found stays empty.
        if found_dict:
            dicts_found[account_record_id] = found_dict
        else:
            dicts_not_found[account_record_id] = account_record_id
    for id_, item in dicts_found.items():
        AccountRecordsPredict.set_session(session)
        AccountRecords.set_session(session)
        account_record_to_save = AccountRecords.query.filter_by(id=int(id_)).first()
        save_dict = dict(
            account_records_id=account_record_to_save.id,
            account_records_uu_id=str(account_record_to_save.uu_id),
            prediction_model="regex",
            treshold=1,  # (sic) keyword kept as-is to match the model column
            is_first_prediction=False,
        )
        update_dict = dict(prediction_model="regex", treshold=1, is_first_prediction=False)
        # (prediction_field, extracted value, wrap value as {"data": ...} JSON?)
        field_specs = [
            ("months", item.get("months", []), True),
            ("years", item.get("years", []), True),
            ("payment_types", item.get("payment_types", []), True),
            ("apartment_number", item.get("apartment_number", []), False),
            ("person_name", item.get("name_match", []), True),
        ]
        if any(value for _, value, _ in field_specs):
            count_extracted += 1
            for field, value, wrap_json in field_specs:
                if not value:
                    continue
                print(f"{field}: {value}")
                result = dumps({"data": value}) if wrap_json else value
                _upsert_prediction(save_dict, update_dict, account_record_to_save.id, field, result)
    print("\n===== SUMMARY =====")
    print(f"extracted data total : {count_extracted}")
    print(f"not extracted data total : {len(account_records) - count_extracted}")
    print(f"Total account records processed : {len(account_records)}")
if __name__ == "__main__":
    # One DB session for the whole run.
    session_factory = get_session_factory()
    session = session_factory()
    # Turkish month names and common abbreviations; read as globals by
    # extract_months().
    turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
    turkish_months_abbr = {
        "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
        "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
    }
    # Year window accepted by extract_year() (read as globals).
    start_year = 1950
    current_year = arrow.now().year
    # All residents of the hard-coded build id 1; note the selected id is
    # bl.id (living-space id), which becomes the person 'id' below.
    people_query = sqlalchemy_text("""
        SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
        FROM public.people as p
        INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
        INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
        INNER JOIN public.build as b ON b.id = bp.build_id
        WHERE b.id = 1
        ORDER BY p.id
    """)
    people_raw = session.execute(people_query).all()
    # Drop rows whose merged full name repeats (same person, several rows).
    remove_duplicate = list()
    clean_people_list = list()
    for person in people_raw:
        merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
        if merged_name not in remove_duplicate:
            clean_people_list.append(person)
            remove_duplicate.append(merged_name)
    people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]
    # Only incoming transfers (positive currency_value) are parsed.
    query_account_records = sqlalchemy_text("""
        SELECT a.id, a.iban, a.bank_date, a.process_comment FROM public.account_records as a where currency_value > 0
    """)  # and bank_date::date >= '2020-01-01'
    account_records = session.execute(query_account_records).all()
    account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]
    try:
        main(session=session, account_records=account_records, people=people)
    except Exception as e:
        # NOTE(review): broad catch that only prints the message — the
        # traceback is lost; consider logging.exception instead.
        print(f"{e}")
    session.close()
    # NOTE(review): .remove() exists on scoped_session; assumes
    # get_session_factory() returns one — confirm.
    session_factory.remove()