diff --git a/.gitignore b/.gitignore
index 9838fc3..a822ad8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,8 +56,12 @@ pids
 report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 env
 .env
-ServicesRunner/AccountRecordServices/Finder/Iban/.prisma-cache
-ServicesRunner/AccountRecordServices/Finder/Comment/.prisma-cache
+# ServicesRunner/AccountRecordServices/Finder/Iban/.prisma-cache
+# ServicesRunner/AccountRecordServices/Finder/Comment/.prisma-cache
+# ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.prisma-cache
+
+**/.prisma-cache
+
 venv/
 .vscode/
 __pycache__/
diff --git a/ServicesRunner/AccountRecordServices/Finder/Comment/.dockerignore b/ServicesRunner/AccountRecordServices/Finder/Accounts/.dockerignore
similarity index 100%
rename from ServicesRunner/AccountRecordServices/Finder/Comment/.dockerignore
rename to ServicesRunner/AccountRecordServices/Finder/Accounts/.dockerignore
diff --git a/ServicesRunner/AccountRecordServices/Finder/Comment/Dockerfile b/ServicesRunner/AccountRecordServices/Finder/Accounts/Dockerfile
similarity index 100%
rename from ServicesRunner/AccountRecordServices/Finder/Comment/Dockerfile
rename to ServicesRunner/AccountRecordServices/Finder/Accounts/Dockerfile
diff --git a/ServicesRunner/AccountRecordServices/Finder/Comment/app.py b/ServicesRunner/AccountRecordServices/Finder/Accounts/app.py
similarity index 93%
rename from ServicesRunner/AccountRecordServices/Finder/Comment/app.py
rename to ServicesRunner/AccountRecordServices/Finder/Accounts/app.py
index f9e4a63..30a0378 100644
--- a/ServicesRunner/AccountRecordServices/Finder/Comment/app.py
+++ b/ServicesRunner/AccountRecordServices/Finder/Accounts/app.py
@@ -2,7 +2,7 @@ import time
 import arrow
 import pprint
 
-from json import dumps
+from json import dumps, loads
 from decimal import Decimal
 from pydantic import BaseModel
 
@@ -85,7 +85,8 @@ def enclose_task_and_send_mail_to_build_manager(prisma_service: PrismaService, s
     for build_id, saved_list_of_account_record in saved_list_of_account_records.items():
         build_manager_occupant_type = prisma_service.find_first(table="occupant_types", query={"occupant_code":"BU-MNG", "is_confirmed": True, "active": True})
         living_space = prisma_service.find_first(
-            table="build_living_space", query={"build_id": build_id, "occupant_type_id": build_manager_occupant_type['id'], "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}}
+            table="build_living_space", query={
+                "build_id": build_id, "occupant_type_id": build_manager_occupant_type['id'], "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}}
         )
         build = prisma_service.find_first(table="builds", query={"id": build_id})
         person = prisma_service.find_first(table="people", query={"id": living_space['person_id']})
@@ -105,8 +106,10 @@ if __name__ == "__main__":
+    prisma_service = PrismaService()
     process_comment_finder_service = ProcessCommentFinderService()
+ print("Process Comment service started") try: print("Process Comment service started sleeping for 5 seconds") @@ -117,7 +120,10 @@ if __name__ == "__main__": for task in tasks: if not check_task_belong_to_this_service(task): continue - write_account_records_row_from_finder_comment(finder_comments=task.data.FinderComment, prisma_service=prisma_service, saved_list_of_account_records=saved_list_of_account_records) + write_account_records_row_from_finder_comment( + finder_comments=task.data.FinderComment, prisma_service=prisma_service, saved_list_of_account_records=saved_list_of_account_records + ) + save_task_object_for_comment_parsing(task=task, process_comment_finder_service=process_comment_finder_service) process_comment_finder_service.update_task_status(task_uuid=task.task, is_completed=True, status=Status.COMPLETED) process_comment_finder_service.delete_task(task_uuid=task.task) enclose_task_and_send_mail_to_build_manager( diff --git a/ServicesRunner/AccountRecordServices/Finder/Comment/entrypoint.sh b/ServicesRunner/AccountRecordServices/Finder/Accounts/entrypoint.sh similarity index 100% rename from ServicesRunner/AccountRecordServices/Finder/Comment/entrypoint.sh rename to ServicesRunner/AccountRecordServices/Finder/Accounts/entrypoint.sh diff --git a/ServicesRunner/AccountRecordServices/Finder/Comment/matchers.py b/ServicesRunner/AccountRecordServices/Finder/Comment/matchers.py deleted file mode 100644 index 638c21e..0000000 --- a/ServicesRunner/AccountRecordServices/Finder/Comment/matchers.py +++ /dev/null @@ -1,518 +0,0 @@ -import re -import arrow - -from json import loads, dumps -from unidecode import unidecode -from difflib import SequenceMatcher -from itertools import permutations -from time import perf_counter - -turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"] -turkish_months_abbr = { - "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN", - "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK", -} -month_to_number_dict = { - "ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6, "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11, "aralık": 12, - "ocak": 1, "subat": 2, "mart": 3, "nisan": 4, "mayis": 5, "haziran": 6, "temmuz": 7, "agustos": 8, "eylul": 9, "ekim": 10, "kasim": 11, "aralik": 12 -} -start_year = 1950 -current_year = arrow.now().year - - -def clean_text(text): - text = str(text) - text = re.sub(r'\d{8,}', ' ', text) - # text = re.sub(r'\b[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*\b|\b[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*\b', ' ', text) - text = text.replace("/", " ") - text = text.replace("_", " ") - text_remove_underscore = text.replace("-", " ").replace("+", " ") - text_remove_asterisk = text_remove_underscore.replace("*", " ") - text_remove_comma = text_remove_asterisk.replace(",", " ") - text_remove_dots = text_remove_comma.replace(".", " ") - text_remove_dots = re.sub(r'\s+', ' ', text_remove_dots) - text_remove_dots = text_remove_dots.strip() - return text_remove_dots - - -def normalize_text(text: str): - text = text.replace('İ', 'i') - text = text.replace('I', 'ı') - text = text.replace('Ş', 'ş') - text = text.replace('Ğ', 'ğ') - text = text.replace('Ü', 'ü') - text = text.replace('Ö', 'ö') - text = text.replace('Ç', 'ç') - return unidecode(text).lower() - - -def 
get_person_initials(person: dict): - parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("surname", ""), person.get("birthname", "")] - return [unidecode(p.strip())[0].upper() for p in parts if p] - - -def get_text_initials(matched_text: str): - return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()] - - -def generate_dictonary_of_patterns(people: list[dict]): - """ - completly remove middle_name instead do regex firstName + SomeWord + surname - """ - patterns_dict = {} - for person in people: - person_id = person.get('id') - firstname = person.get('firstname', '').strip() if person.get('firstname') else "" - middle_name = person.get('middle_name', '').strip() if person.get('middle_name') else "" - surname = person.get('surname', '').strip() if person.get('surname') else "" - birthname = person.get('birthname', '').strip() if person.get('birthname') else "" - if not firstname or not surname: - continue - - name_parts = { - 'firstname': { - 'orig': firstname, - 'norm': normalize_text(firstname) if firstname else "", - 'init': normalize_text(firstname)[0] if firstname else "" - }, - 'surname': { - 'orig': surname, - 'norm': normalize_text(surname) if surname else "", - 'init': normalize_text(surname)[0] if surname else "" - } - } - - if middle_name: - name_parts['middle_name'] = { - 'orig': middle_name, - 'norm': normalize_text(middle_name) if middle_name else "", - 'init': normalize_text(middle_name)[0] if middle_name else "" - } - - if birthname and normalize_text(birthname) != normalize_text(surname): - name_parts['birthname'] = { - 'orig': birthname, - 'norm': normalize_text(birthname), - 'init': normalize_text(birthname)[0] if birthname else "" - } - - person_patterns = set() - def create_pattern(parts, formats, separators=None): - if separators is None: - separators = [""] - patterns = [] - for fmt in formats: - for sep in separators: - pattern_parts = [] - for part_type, part_name in fmt: - if part_name in parts and part_type in parts[part_name]: - pattern_parts.append(re.escape(parts[part_name][part_type])) - if pattern_parts: - patterns.append(r"\b" + sep.join(pattern_parts) + r"\b") - return patterns - - name_formats = [ - [('orig', 'firstname'), ('orig', 'surname')], - [('norm', 'firstname'), ('norm', 'surname')], - [('orig', 'surname'), ('orig', 'firstname')], - [('norm', 'surname'), ('norm', 'firstname')], - ] - if 'middle_name' in name_parts: - name_formats = [ - [('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')], - [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')], - ] - person_patterns.update(create_pattern(name_parts, name_formats, [" ", ""])) - - if 'middle_name' in name_parts: - middle_name_formats = [ - [('orig', 'firstname'), ('orig', 'middle_name')], - [('norm', 'firstname'), ('norm', 'middle_name')], - [('orig', 'middle_name'), ('orig', 'surname')], - [('norm', 'middle_name'), ('norm', 'surname')], - ] - person_patterns.update(create_pattern(name_parts, middle_name_formats, [" ", ""])) - - if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']: - birthname_formats = [ - [('orig', 'firstname'), ('orig', 'birthname')], - [('norm', 'firstname'), ('norm', 'birthname')], - [('orig', 'birthname'), ('orig', 'firstname')], - [('norm', 'birthname'), ('norm', 'firstname')], - ] - person_patterns.update(create_pattern(name_parts, birthname_formats, [" ", ""])) - initial_formats = [[('init', 'firstname'), ('init', 'middle_name'), 
('init', 'surname')], [('init', 'firstname'), ('init', 'surname')]] - person_patterns.update(create_pattern(name_parts, initial_formats, ["", ".", " ", ". "])) - if 'middle_name' in name_parts: - triple_initial_formats = [ - [('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')], - ] - person_patterns.update(create_pattern(name_parts, triple_initial_formats, ["", ".", " ", ". "])) - compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns] - patterns_dict[person_id] = compiled_patterns - return patterns_dict - - -def extract_person_name_with_regex(found_dict: dict, process_comment: str, patterns_dict: dict, people: list[dict]): - cleaned_text = process_comment - all_matches = [] - for person_id, patterns in patterns_dict.items(): - person = next((p for p in people if p.get('id') == person_id), None) - if not person: - continue - firstname_norm = normalize_text(person.get("firstname", "").strip()) if person.get("firstname") else "" - middle_name_norm = normalize_text(person.get("middle_name", "").strip()) if person.get("middle_name") else "" - surname_norm = normalize_text(person.get("surname", "").strip()) if person.get("surname") else "" - birthname_norm = normalize_text(person.get("birthname", "").strip()) if person.get("birthname") else "" - text_norm = normalize_text(process_comment) - person_matches = [] - for pattern in patterns: - for match in pattern.finditer(text_norm): - start, end = match.span() - matched_text = process_comment[start:end] - matched_text_norm = normalize_text(matched_text) - is_valid_match = False - if len(matched_text_norm.split()) <= 1: - is_valid_match = False - else: - has_firstname = firstname_norm and firstname_norm in matched_text_norm - has_surname = surname_norm and surname_norm in matched_text_norm - has_birthname = birthname_norm and birthname_norm in matched_text_norm - if (has_firstname and has_surname) or (has_firstname and has_birthname): - is_valid_match = True - if is_valid_match: - person_matches.append({'matched_text': matched_text, 'start': start, 'end': end}) - if person_matches: - person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True) - non_overlapping_matches = [] - for match in person_matches: - overlaps = False - for existing_match in non_overlapping_matches: - if (match['start'] < existing_match['end'] and match['end'] > existing_match['start']): - overlaps = True - break - if not overlaps: - non_overlapping_matches.append(match) - if non_overlapping_matches: - found_dict["name_match"] = person - all_matches.extend([(match, person) for match in non_overlapping_matches]) - if all_matches: - all_matches.sort(key=lambda x: x[0]['start'], reverse=True) - for match, person in all_matches: - matched_text = match['matched_text'] - matched_words = matched_text.split() - for word in matched_words: - word_norm = normalize_text(word).strip() - if not word_norm: - continue - text_norm = normalize_text(cleaned_text) - for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE): - start, end = word_match.span() - cleaned_text = cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:] - cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip() - return found_dict, cleaned_text - - -def extract_build_parts_info(found_dict, process_comment): - """ - Regex of parts such as : - 2 nolu daire - 9 NUMARALI DAI - daire 3 - 3 nolu dairenin - 11nolu daire - Daire No 12 - 2NOLU DAIRE - 12 No lu daire - D:10 - NO:11 - NO :3 - """ - apartment_number = None - cleaned_text 
= process_comment - def clean_text_apartment_number(text, match): - clean_text = text.replace(match.group(0), '').strip() - clean_text = re.sub(r'\s+', ' ', clean_text).strip() - return clean_text - pattern1 = re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE) - match = pattern1.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern4 = re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE) - match = pattern4.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern5 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE) - match = pattern5.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern7 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE) - match = pattern7.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern8 = re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE) - match = pattern8.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern6 = re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE) - match = pattern6.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern2 = re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE) - match = pattern2.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern3 = re.compile(r'daire\s*(\d+)', re.IGNORECASE) - match = pattern3.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern9 = re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE) - match = pattern9.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - pattern10 = re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE) - match = pattern10.search(cleaned_text) - if match: - apartment_number = match.group(1) - found_dict['apartment_number'] = apartment_number - return found_dict, clean_text_apartment_number(cleaned_text, match) - return found_dict, cleaned_text - - -def extract_months(found_dict, process_comment): - """ - Extract Turkish month names and abbreviations from the process comment - """ - original_text = process_comment - - def normalize_turkish(text: str) -> str: - """Properly normalize Turkish text for case-insensitive comparison""" - text = text.lower() - text = text.replace('i̇', 'i') # Handle dotted i properly - text = text.replace('ı', 'i') # Convert dotless i to regular i for matching - text = unidecode(text) # Remove other diacritics - return text - - if 'months' not in found_dict: - found_dict['months'] = [] - - months_found, 
working_text = False, original_text - for month in turkish_months: - pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE) - for match in pattern.finditer(original_text): - matched_text = match.group(0) - normalized_month = normalize_turkish(month) - month_number = None - if month.lower() in month_to_number_dict: - month_number = month_to_number_dict[month.lower()] - elif normalized_month in month_to_number_dict: - month_number = month_to_number_dict[normalized_month] - month_info = {'name': month, 'number': month_number} - found_dict['months'].append(month_info) - months_found = True - working_text = working_text.replace(matched_text, '', 1) - - for abbr, full_month in turkish_months_abbr.items(): - pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE) - for match in pattern.finditer(working_text): - matched_text = match.group(0) - normalized_month = normalize_turkish(full_month) - month_number = None - if full_month.lower() in month_to_number_dict: - month_number = month_to_number_dict[full_month.lower()] - elif normalized_month in month_to_number_dict: - month_number = month_to_number_dict[normalized_month] - month_info = {'name': full_month, 'number': month_number} - found_dict['months'].append(month_info) - months_found = True - working_text = working_text.replace(matched_text, '', 1) - return found_dict, working_text - - -def extract_year(found_dict, process_comment): - """ - Extract years from the process comment - """ - original_text = process_comment - if 'years' not in found_dict: - found_dict['years'] = [] - working_text = original_text - for year in range(start_year, current_year + 1): - pattern = re.compile(r'\b' + str(year) + r'\b', re.IGNORECASE) - for match in pattern.finditer(original_text): - matched_text = match.group(0) - if str(matched_text).isdigit(): - found_dict['years'].append(int(matched_text)) - working_text = working_text.replace(matched_text, '', 1) - return found_dict, working_text - - -def extract_payment_type(found_dict, process_comment): - """ - Extract payment type from the process comment - aidat - AİD - aidatı - TADİLAT - YAKIT - yakıt - yakit - """ - original_text = process_comment - working_text = original_text - if 'payment_types' not in found_dict: - found_dict['payment_types'] = [] - payment_keywords = {'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'], 'tadilat': ['tadilat', 'tadİlat', 'tadilatı'], 'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti']} - for payment_type, keywords in payment_keywords.items(): - for keyword in keywords: - pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE) - for match in pattern.finditer(original_text): - matched_text = match.group(0) - if payment_type not in found_dict['payment_types']: - found_dict['payment_types'].append(payment_type) - working_text = working_text.replace(matched_text, '', 1) - return found_dict, working_text - - -def main(account_records, people): - list_of_regex_patterns = generate_dictonary_of_patterns(people=people) - dicts_found = dict() - dicts_not_found = dict() - count_extracted = 0 - for account_record in account_records: - account_record_id = str(account_record["id"]) - found_dict = {} - process_comment_iteration = clean_text(text=account_record["process_comment"]) - found_dict, cleaned_process_comment = extract_person_name_with_regex(found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=list_of_regex_patterns, people=people) - found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, 
process_comment=cleaned_process_comment) - found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment) - found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment) - found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment) - if found_dict: - dicts_found[str(account_record_id)] = found_dict - else: - dicts_not_found[str(account_record_id)] = account_record_id - - for id_, item in dicts_found.items(): - - months_are_valid = bool(item.get("months", [])) - years_are_valid = bool(item.get("years", [])) - payment_types_are_valid = bool(item.get("payment_types", [])) - apartment_number_are_valid = bool(item.get("apartment_number", [])) - person_name_are_valid = bool(item.get("name_match", [])) - account_record_to_save = AccountRecords.query.filter_by(id=int(id_)).first() - save_dict = dict( - account_records_id=account_record_to_save.id, account_records_uu_id=str(account_record_to_save.uu_id), prediction_model="regex", treshold=1, is_first_prediction=False - ) - update_dict = dict(prediction_model="regex", treshold=1, is_first_prediction=False) - if any([months_are_valid, years_are_valid, payment_types_are_valid, apartment_number_are_valid, person_name_are_valid]): - count_extracted += 1 - if months_are_valid: - print(f"months: {item['months']}") - data_to_save = dumps({"data": item['months']}) - prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="months", prediction_model="regex").first() - if not prediction_result: - created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="months", prediction_result=data_to_save) - created_account_prediction.save() - else: - prediction_result.update(**update_dict, prediction_result=data_to_save) - prediction_result.save() - if years_are_valid: - print(f"years: {item['years']}") - data_to_save = dumps({"data": item['years']}) - prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="years", prediction_model="regex").first() - if not prediction_result: - created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="years", prediction_result=data_to_save) - created_account_prediction.save() - else: - prediction_result.update(**update_dict, prediction_result=data_to_save) - prediction_result.save() - if payment_types_are_valid: - print(f"payment_types: {item['payment_types']}") - data_to_save = dumps({"data": item['payment_types']}) - prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="payment_types", prediction_model="regex").first() - if not prediction_result: - created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="payment_types", prediction_result=data_to_save) - created_account_prediction.save() - else: - prediction_result.update(**update_dict, prediction_result=data_to_save) - prediction_result.save() - if apartment_number_are_valid: - print(f"apartment_number: {item['apartment_number']}") - prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="apartment_number", prediction_model="regex").first() - if not prediction_result: - created_account_prediction = AccountRecordsPredict.create(**save_dict, 
prediction_field="apartment_number", prediction_result=item['apartment_number'])
-                created_account_prediction.save()
-            else:
-                prediction_result.update(**update_dict, prediction_result=item['apartment_number'])
-                prediction_result.save()
-        if person_name_are_valid:
-            print(f"person_name: {item['name_match']}")
-            data_to_save = dumps({"data": item['name_match']})
-            prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="person_name", prediction_model="regex").first()
-            if not prediction_result:
-                created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="person_name", prediction_result=data_to_save)
-                created_account_prediction.save()
-            else:
-                prediction_result.update(**update_dict, prediction_result=data_to_save)
-                prediction_result.save()
-
-    print("\n===== SUMMARY =====")
-    print(f"extracted data total : {count_extracted}")
-    print(f"not extracted data total : {len(account_records) - count_extracted}")
-    print(f"Total account records processed : {len(account_records)}")
-
-
-if __name__ == "__main__":
-
-    people_query = sqlalchemy_text("""
-        SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
-        FROM public.people as p
-        INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
-        INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
-        INNER JOIN public.build as b ON b.id = bp.build_id
-        WHERE b.id = 1
-        ORDER BY p.id
-    """)
-
-    people_raw = session.execute(people_query).all()
-    remove_duplicate = list()
-    clean_people_list = list()
-    for person in people_raw:
-        merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
-        if merged_name not in remove_duplicate:
-            clean_people_list.append(person)
-            remove_duplicate.append(merged_name)
-
-    people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]
-    query_account_records = sqlalchemy_text("""
-        SELECT a.id, a.iban, a.bank_date, a.process_comment FROM public.account_records as a where currency_value > 0
-    """) # and bank_date::date >= '2020-01-01'
-    account_records = session.execute(query_account_records).all()
-    account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]
-
-    try:
-        main(session=session, account_records=account_records, people=people)
-    except Exception as e:
-        print(f"{e}")
-
-    session.close()
-    session_factory.remove()
diff --git a/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.dockerignore b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.dockerignore
new file mode 100644
index 0000000..f0d05fd
--- /dev/null
+++ b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.dockerignore
@@ -0,0 +1,14 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.db
+*.sqlite3
+*.log
+*.env
+venv/
+.env.*
+node_modules/
+.prisma/
+.prisma-cache/
+ServicesRunner/AccountRecordServices/Test/venv/
diff --git a/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/Dockerfile b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/Dockerfile
new file mode 100644
index 0000000..5f74a9d
--- /dev/null
+++ b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.12-slim
+
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV VIRTUAL_ENV=/opt/venv
+ENV PRISMA_SCHEMA_PATH=/app/Depends/schema.prisma
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH=/app
+
+RUN apt-get update && apt-get install 
-y --no-install-recommends gcc curl && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY ServicesRunner/Depends/ /app/Depends/ +COPY ServicesRunner/AccountRecordServices/Finder/Parser/Comment /app/ + +COPY ServicesRunner/requirements.txt /app/requirements.txt +COPY ServicesRunner/AccountRecordServices/Finder/Parser/Comment/entrypoint.sh /entrypoint.sh + +RUN chmod +x /entrypoint.sh + +CMD ["/entrypoint.sh"] diff --git a/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/app.py b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/app.py new file mode 100644 index 0000000..43e0373 --- /dev/null +++ b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/app.py @@ -0,0 +1,161 @@ +import time +import arrow + +from typing import Optional +from pydantic import BaseModel +from matchers import Parser +from models import BuildingCluster, BuildPart, BuildLivingSpace, Person, User, OccupantType + +from Depends.prisma_client import PrismaService +from Depends.config import ConfigServices, RedisTaskObject +from Depends.service_handler import ProcessCommentParserService + + +def check_task_belong_to_this_service(task: RedisTaskObject): + """ + Check if task belongs to this service + """ + if not task.service == ConfigServices.TASK_COMMENT_PARSER: + return False + if not task.completed: + return False + if task.is_completed: + return False + if not task.data: + return False + return True + + +def get_all_person_data_due_to_build(prisma_service: PrismaService): + """ + Get all person data due to build with comprehensive inner joins + Returns a dictionary of buildings clustered with their build parts, people, and living spaces + """ + buildings_dict, today = {}, arrow.now().to('GMT+3').datetime + occupant_flat_owner = prisma_service.find_first(table="occupant_types", query={"occupant_code": "FL-OWN", "active": True, "is_confirmed": True}, include={"user_types": True}) + occupant_tenant = prisma_service.find_first(table="occupant_types", query={"occupant_code": "FL-TEN", "active": True, "is_confirmed": True}, include={"user_types": True}) + possible_money_sender_occupants = [occupant_flat_owner.id, occupant_tenant.id] + buildings = prisma_service.find_many(table="build", query={"active": True, "is_confirmed": True,"expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}}) + for build in buildings: + buildings_dict[str(build.id)] = BuildingCluster( + id=build.id, + uu_id=build.uu_id, + build_name=build.build_name, + build_no=build.build_no, + build_date=str(build.build_date), + decision_period_date=str(build.decision_period_date), + expiry_starts=str(build.expiry_starts), + expiry_ends=str(build.expiry_ends), + is_confirmed=build.is_confirmed, + active=build.active, + build_parts=[] + ) + build_parts = prisma_service.find_many(table="build_parts", query={"build_id": build.id, "active": True, "is_confirmed": True, "human_livable": True, "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}}) + for build_part in build_parts: + part_obj = BuildPart( + id=build_part.id, + uu_id=build_part.uu_id, + part_no=build_part.part_no, + part_level=build_part.part_level, + part_code=build_part.part_code, + part_gross_size=build_part.part_gross_size, + part_net_size=build_part.part_net_size, + human_livable=build_part.human_livable, + build_id=build_part.build_id, + build_uu_id=build_part.build_uu_id, + is_confirmed=build_part.is_confirmed, + active=build_part.active, + living_spaces=[], + build=None + ) + living_spaces = prisma_service.find_many( + table="build_living_space", 
include={"occupant_types": True, "people": {"include": {"users": True}}}, + query={"build_parts_id": build_part.id, "active": True, "is_confirmed": True, "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}, "occupant_type_id": {"in": possible_money_sender_occupants}}, + ) + for living_space in living_spaces: + person = living_space.people + user = prisma_service.find_first(table="users", query={"person_id": person.id, "active": True, "is_confirmed": True}) + user_of_person = None + if user: + user_of_person = User( + id=user.id, + uu_id=user.uu_id, + user_tag=user.user_tag, + user_type=user.user_type, + email=user.email, + phone_number=user.phone_number, + related_company=user.related_company, + is_confirmed=user.is_confirmed, + active=user.active + ) + person_obj = Person( + id=person.id, + uu_id=person.uu_id, + firstname=person.firstname, + surname=person.surname, + middle_name=person.middle_name, + birthname=person.birthname, + is_confirmed=person.is_confirmed, + active=person.active, + user=user_of_person + ) + occupant_type = living_space.occupant_types + occupant_type_obj = OccupantType( + id=occupant_type.id, + uu_id=occupant_type.uu_id, + occupant_code=occupant_type.occupant_code, + occupant_type=occupant_type.occupant_type, + is_confirmed=occupant_type.is_confirmed, + active=occupant_type.active, + user_type_uu_id=occupant_type.user_type_uu_id + ) + living_space_obj = BuildLivingSpace( + id=living_space.id, + uu_id=living_space.uu_id, + expiry_starts=str(living_space.expiry_starts), + expiry_ends=str(living_space.expiry_ends), + fix_value=float(living_space.fix_value), + fix_percent=float(living_space.fix_percent), + agreement_no=living_space.agreement_no, + marketing_process=living_space.marketing_process, + build_parts_id=living_space.build_parts_id, + build_parts_uu_id=living_space.build_parts_uu_id, + person_id=living_space.person_id, + person_uu_id=living_space.person_uu_id, + occupant_type_id=living_space.occupant_type_id, + occupant_type_uu_id=living_space.occupant_type_uu_id, + is_confirmed=living_space.is_confirmed, + active=living_space.active, + person=person_obj, + occupant_types=occupant_type_obj + ) + part_obj.living_spaces.append(living_space_obj) + buildings_dict[str(build.id)].build_parts.append(part_obj) + return {i: v.dict(exclude_none=True) for i, v in buildings_dict.items()} + + +def get_all_companies_data(prisma_service: PrismaService): + return prisma_service.find_many(table="companies", query={"active": True, "is_confirmed": True}) + + +if __name__ == "__main__": + print("Process Comment Parser service started") + prisma_service = PrismaService() + process_comment_parser_service = ProcessCommentParserService() + search_people = get_all_person_data_due_to_build(prisma_service) + process_comment_parser_service.set_task_requirements(search_people) + arriving_account_records = prisma_service.find_many(table="account_records", query={"active": True, "is_confirmed": True, "approved_record": False, "currency_value": {"gt": 0}}) + debt_account_records = prisma_service.find_many(table="account_records", query={"active": True, "is_confirmed": True, "approved_record": False, "currency_value": {"lt": 0}}) + try: + while True: + time.sleep(5) + print("Process Comment Parser service started sleeping for 5 seconds") + tasks_dict = process_comment_parser_service.get_task_requirements() + task_requirements: dict[str, BuildingCluster] = {idx: BuildingCluster(**value) for idx, value in tasks_dict.items()} + parser = Parser(account_records=arriving_account_records, 
task_requirements=task_requirements) + parsed_records = parser.parse() + except Exception as e: + print(f"Process Comment Parser service error: {str(e)}") + raise e + finally: + prisma_service.disconnect() diff --git a/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/entrypoint.sh b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/entrypoint.sh new file mode 100644 index 0000000..8afa504 --- /dev/null +++ b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +VENV_PATH="/opt/venv" +REQUIREMENTS_PATH="/app/requirements.txt" +SCHEMA_PATH="/app/Depends/schema.prisma" +PRISMA_BINARY_PATH="/root/.cache/prisma-python/binaries" + +if [ ! -x "$VENV_PATH/bin/python" ]; then + python -m venv "$VENV_PATH" + "$VENV_PATH/bin/pip" install pip --upgrade + "$VENV_PATH/bin/pip" install --no-cache-dir -r "$REQUIREMENTS_PATH" + "$VENV_PATH/bin/prisma" generate --schema "$SCHEMA_PATH" +fi + +if ! find "$PRISMA_BINARY_PATH" -type f -name "prisma-query-engine-debian-openssl-3.0.x" | grep -q .; then + "$VENV_PATH/bin/prisma" py fetch +fi + +exec "$VENV_PATH/bin/python" -u app.py diff --git a/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/matchers.py b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/matchers.py new file mode 100644 index 0000000..0d3f7b5 --- /dev/null +++ b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/matchers.py @@ -0,0 +1,566 @@ +import pprint +import re +import arrow + +from json import loads, dumps +from unidecode import unidecode +from models import BuildingCluster, Person + + +turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"] +turkish_months_abbr = { + "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN", + "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK", "AGUSTOS": "AĞUSTOS" +} +month_to_number_dict = { + "ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6, "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11, "aralık": 12, + "ocak": 1, "subat": 2, "mart": 3, "nisan": 4, "mayis": 5, "haziran": 6, "temmuz": 7, "agustos": 8, "eylul": 9, "ekim": 10, "kasim": 11, "aralik": 12 +} +start_year = 1950 +current_year = arrow.now().year + + +class ParsedComment: + + def __init__(self, account_record_id: int, org_comment: str) -> None: + self.account_record_id: int = account_record_id + self.org_comment: str = org_comment + self.comment: str = None + self.people: list[dict] = [] + self.parts: list[dict] = [] + self.months: list[str] = [] + self.years: list[str] = [] + self.payment_types: list[str] = [] + + def set_people(self, people: list[dict]) -> None: + self.people = people + + def set_parts(self, parts: list[dict]) -> None: + self.parts = parts + + def set_months(self, months: list[str]) -> None: + self.months = months + + def set_years(self, years: list[str]) -> None: + self.years = years + + def set_payment_types(self, payment_types: list[str]) -> None: + self.payment_types = payment_types + + +class ParserHelpers: + + @staticmethod + def normalize_text(text: str) -> str: + text = text.replace('İ', 'i') + text = text.replace('I', 'ı') + text = text.replace('Ş', 'ş') + text = text.replace('Ğ', 'ğ') + text = text.replace('Ü', 'ü') + text = text.replace('Ö', 'ö') + text = text.replace('Ç', 'ç') + return unidecode(text).lower() + + +class 
ParserRequirements(ParserHelpers):
+
+    @staticmethod
+    def create_pattern(parts, formats, separators=None):
+        """
+        parts: dict
+        formats: list[list[tuple[str, str]]]
+        separators: list[str]
+        """
+        if separators is None:
+            separators = [""]
+        patterns = []
+        for fmt in formats:
+            for sep in separators:
+                pattern_parts = []
+                for part_type, part_name in fmt:
+                    if part_name in parts and part_type in parts[part_name]:
+                        pattern_parts.append(re.escape(parts[part_name][part_type]))
+                if pattern_parts:
+                    patterns.append(r"\b" + sep.join(pattern_parts) + r"\b")
+        return patterns
+
+    @classmethod
+    def generate_dictonary_of_patterns(cls, person: Person):
+        """Completely remove middle_name; instead, regex on firstName + SomeWord + surname"""
+        patterns_dict = {}
+        person_patterns, firstname, birthname = set(), person.firstname.strip() if person.firstname else "", person.birthname.strip() if person.birthname else ""
+        middle_name, surname = person.middle_name.strip() if person.middle_name else "", person.surname.strip() if person.surname else ""
+        if not firstname or not surname:
+            return patterns_dict
+        name_parts = {
+            'firstname': {'orig': firstname, 'norm': cls.normalize_text(firstname) if firstname else "", 'init': cls.normalize_text(firstname)[0] if firstname else ""},
+            'surname': {'orig': surname, 'norm': cls.normalize_text(surname) if surname else "", 'init': cls.normalize_text(surname)[0] if surname else ""}
+        }
+        if middle_name:
+            name_parts['middle_name'] = {'orig': middle_name, 'norm': cls.normalize_text(middle_name) if middle_name else "", 'init': cls.normalize_text(middle_name)[0] if middle_name else ""}
+        if birthname and cls.normalize_text(birthname) != cls.normalize_text(surname):
+            name_parts['birthname'] = {'orig': birthname, 'norm': cls.normalize_text(birthname), 'init': cls.normalize_text(birthname)[0] if birthname else ""}
+        name_formats = [[('orig', 'firstname'), ('orig', 'surname')], [('norm', 'firstname'), ('norm', 'surname')], [('orig', 'surname'), ('orig', 'firstname')], [('norm', 'surname'), ('norm', 'firstname')]]
+        if 'middle_name' in name_parts:
+            name_formats = [[('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')], [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')]]
+        person_patterns.update(cls.create_pattern(name_parts, name_formats, [" ", ""]))
+        if 'middle_name' in name_parts:
+            middle_name_formats = [[('orig', 'firstname'), ('orig', 'middle_name')], [('norm', 'firstname'), ('norm', 'middle_name')], [('orig', 'middle_name'), ('orig', 'surname')], [('norm', 'middle_name'), ('norm', 'surname')]]
+            person_patterns.update(cls.create_pattern(name_parts, middle_name_formats, [" ", ""]))
+        if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']:
+            birthname_formats = [
+                [('orig', 'firstname'), ('orig', 'birthname')], [('norm', 'firstname'), ('norm', 'birthname')],
+                [('orig', 'birthname'), ('orig', 'firstname')], [('norm', 'birthname'), ('norm', 'firstname')]
+            ]
+            person_patterns.update(cls.create_pattern(name_parts, birthname_formats, [" ", ""]))
+        initial_formats = [[('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')], [('init', 'firstname'), ('init', 'surname')]]
+        person_patterns.update(cls.create_pattern(name_parts, initial_formats, ["", ".", " ", ". "]))
+        if 'middle_name' in name_parts:
+            triple_initial_formats = [[('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')]]
+            person_patterns.update(cls.create_pattern(name_parts, triple_initial_formats, ["", ".", " ", ". 
"])) + compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns] + patterns_dict[str(person.id)] = compiled_patterns + return patterns_dict + + +class CommentParser(ParserHelpers): + + def __init__(self, account_record, people_regex_dict: dict, people_dict: dict) -> None: + self.original_comment: str = account_record.process_comment + self.comment: str = self.clean_text(account_record.process_comment) + self.people_regex_dict: dict = people_regex_dict + self.people: dict = people_dict + self.account_record_id: str = str(account_record.id) + self.build_id: str = str(account_record.build_id) + self.parsed_comment: ParsedComment = ParsedComment(account_record_id=self.account_record_id, org_comment=self.original_comment) + + @staticmethod + def clean_text_apartment_number(text: str, match): + clean_text = text.replace(match.group(0), '').strip() + clean_text = re.sub(r'\s+', ' ', clean_text).strip() + return clean_text + + @staticmethod + def clean_text(text: str) -> str: + text = str(text) + text = re.sub(r'\d{8,}', ' ', text) + # text = re.sub(r'\b[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*\b|\b[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*\b', ' ', text) + text = text.replace("/", " ") + text = text.replace("_", " ") + text_remove_underscore = text.replace("-", " ").replace("+", " ") + text_remove_asterisk = text_remove_underscore.replace("*", " ") + text_remove_comma = text_remove_asterisk.replace(",", " ") + text_remove_dots = text_remove_comma.replace(".", " ") + text_remove_dots = re.sub(r'\s+', ' ', text_remove_dots) + text_remove_dots = text_remove_dots.strip() + return text_remove_dots + + def get_people_regex_by_build_id(self) -> dict: + """ + Get people regex by build id + """ + return self.people_regex_dict.get(self.build_id, {}) + + def get_person(self, person_id: str) -> Person | None: + return self.people[str(self.build_id)].get(person_id, None) + + def parse_comment(self) -> ParsedComment: + """ + Parse comment and extract information + """ + self.extract_person_name_with_regex() + self.extract_build_parts_info() + self.extract_months() + self.extract_years() + self.extract_payment_type() + self.comment = self.comment.strip() + self.parsed_comment.comment = self.comment + return self.parsed_comment + + def get_text_initials(matched_text: str): + return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()] + + def extract_person_name_with_regex(self): + all_matches, found_dict = [], {} + build_regex = self.get_people_regex_by_build_id() + for person_id, patterns in build_regex.items(): + person_matches = [] + person = self.get_person(str(person_id)) + if not person: + continue + firstname_norm = str(self.normalize_text(person.firstname)).strip() if person.firstname else "" + # middle_name_norm = str(self.normalize_text(person.middle_name)).strip() if person.middle_name else "" + surname_norm = str(self.normalize_text(person.surname)).strip() if person.surname else "" + birthname_norm = str(self.normalize_text(person.birthname)).strip() if person.birthname else "" + text_norm = str(self.normalize_text(self.comment)) + for pattern in patterns[str(person_id)]: + for match in pattern.finditer(text_norm): + start, end = match.span() + matched_text: str = self.comment[start:end] + matched_text_norm = self.normalize_text(matched_text) + is_valid_match = False + if len(matched_text_norm.split()) <= 1: + is_valid_match = False + else: + has_firstname = firstname_norm and firstname_norm in matched_text_norm + 
has_surname = surname_norm and surname_norm in matched_text_norm + has_birthname = birthname_norm and birthname_norm in matched_text_norm + if (has_firstname and has_surname) or (has_firstname and has_birthname): + is_valid_match = True + if is_valid_match: + person_matches.append({'matched_text': matched_text, 'start': start, 'end': end}) + if person_matches: + person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True) + non_overlapping_matches = [] + for match in person_matches: + overlaps = False + for existing_match in non_overlapping_matches: + if (match['start'] < existing_match['end'] and match['end'] > existing_match['start']): + overlaps = True + break + if not overlaps: + non_overlapping_matches.append(match) + if non_overlapping_matches: + found_dict["name_match"] = person + all_matches.extend([(match, person) for match in non_overlapping_matches]) + if all_matches: + all_matches.sort(key=lambda x: x[0]['start'], reverse=True) + for match, person in all_matches: + matched_text: str = match['matched_text'] + matched_words = matched_text.split() + for word in matched_words: + word_norm = str(self.normalize_text(word)).strip() + if not word_norm: + continue + text_norm = self.normalize_text(self.comment) + if not any([person_com for person_com in self.parsed_comment.people if str(person_com.id) == str(person.id)]): + self.parsed_comment.people.append(person) + for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE): + start, end = word_match.span() + self.comment = self.comment[:start] + ' ' * (end - start) + self.comment[end:] + self.comment = re.sub(r'\s+', ' ', self.comment).strip() + + def extract_build_parts_info(self): + """ + Regex of parts such as : + 2 nolu daire + 9 NUMARALI DAI + daire 3 + 3 nolu dairenin + 11nolu daire + Daire No 12 + 2NOLU DAIRE + 12 No lu daire + D:10 + NO:11 + NO :3 + """ + apartment_number = None + pattern1 = re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE) + match = pattern1.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern4 = re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE) + match = pattern4.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern5 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE) + match = pattern5.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern7 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE) + match = pattern7.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern8 = re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE) + match = pattern8.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern6 = re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE) + match = pattern6.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + 
self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern2 = re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE) + match = pattern2.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern3 = re.compile(r'daire\s*(\d+)', re.IGNORECASE) + match = pattern3.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern9 = re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE) + match = pattern9.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + pattern10 = re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE) + match = pattern10.search(self.comment) + if match: + apartment_number = match.group(1) + self.parsed_comment.parts.append(apartment_number) + self.comment = self.clean_text_apartment_number(self.comment, match) + return + # return found_dict, self.comment + + def extract_months(self): + """ + Extract Turkish month names and abbreviations from the process comment + """ + original_text = self.comment + working_text = original_text + for month in turkish_months: + pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE) + for match in pattern.finditer(original_text): + matched_text = match.group(0) + normalized_month = self.normalize_text(month) + month_number = None + if month.lower() in month_to_number_dict: + month_number = month_to_number_dict[month.lower()] + elif normalized_month in month_to_number_dict: + month_number = month_to_number_dict[normalized_month] + month_info = {'name': month, 'number': month_number} + self.parsed_comment.months.append(month_info) + working_text = working_text.replace(matched_text, '', 1) + + for abbr, full_month in turkish_months_abbr.items(): + pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE) + for match in pattern.finditer(working_text): + matched_text = match.group(0) + normalized_month = self.normalize_text(full_month) + month_number = None + if full_month.lower() in month_to_number_dict: + month_number = month_to_number_dict[full_month.lower()] + elif normalized_month in month_to_number_dict: + month_number = month_to_number_dict[normalized_month] + month_info = {'name': full_month, 'number': month_number} + self.parsed_comment.months.append(month_info) + working_text = working_text.replace(matched_text, '', 1) + self.comment = working_text + + def extract_years(self): + """ + Extract years from the process comment + """ + original_text = self.comment + working_text = original_text + for year in range(start_year, current_year + 1): + pattern = re.compile(r'\b' + str(year) + r'\b', re.IGNORECASE) + for match in pattern.finditer(original_text): + matched_text = match.group(0) + if str(matched_text).isdigit(): + self.parsed_comment.years.append(int(matched_text)) + working_text = working_text.replace(matched_text, '', 1) + self.comment = working_text + + def extract_payment_type(self): + """ + Extract payment type from the process comment : aidat, AİD, aidatı, TADİLAT, YAKIT, yakıt, yakit + """ + original_text = self.comment + working_text = original_text + payment_keywords = {'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'], 'tadilat': ['tadilat', 
'tadİlat', 'tadilatı'], 'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti']}
+        for payment_type, keywords in payment_keywords.items():
+            for keyword in keywords:
+                pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
+                for match in pattern.finditer(original_text):
+                    matched_text = match.group(0)
+                    if payment_type not in self.parsed_comment.payment_types:
+                        self.parsed_comment.payment_types.append(payment_type)
+                    working_text = working_text.replace(matched_text, '', 1)
+        self.comment = working_text
+
+
+class Parser:
+
+    def __init__(self, account_records: list, task_requirements: dict[str, BuildingCluster]) -> None:
+        """
+        Initialize the parser with account records and task requirements
+        """
+        self.account_records: list = account_records
+        self.task_requirements: dict[str, BuildingCluster] = task_requirements
+        self.people_dict: dict[str, dict[str, Person]] = {}
+        self.people_regex_dict: dict = self.prepare_people_regex_dict()
+        self.parsed_records: list[ParsedComment] = []
+
+    def prepare_people_regex_dict(self):
+        """Build a nested {build_id: {person_id: patterns}} regex dictionary for every known person"""
+        regex_pattern_dict = {}
+        for build_id, build_cluster in self.task_requirements.items():
+            for build_part in build_cluster.build_parts:
+                for living_space in build_part.living_spaces:
+                    person: Person = living_space.person
+                    if str(build_id) in self.people_dict:
+                        if str(person.id) not in self.people_dict[str(build_id)]:
+                            self.people_dict[str(build_id)][str(person.id)] = person
+                    else:
+                        self.people_dict[str(build_id)] = {str(person.id): person}
+        for build_id, people in self.people_dict.items():
+            for person_id, person in people.items():
+                if str(build_id) not in regex_pattern_dict:
+                    regex_pattern_dict[str(build_id)] = {}
+                regex_pattern_dict[str(build_id)][str(person_id)] = ParserRequirements.generate_dictonary_of_patterns(person)
+        return regex_pattern_dict
+
+    def parse(self):
+        """Parse account records based on task requirements"""
+        for account_record in self.account_records:
+            comment_parser = CommentParser(account_record=account_record, people_regex_dict=self.people_regex_dict, people_dict=self.people_dict)
+            parsed_comment = comment_parser.parse_comment()
+            self.parsed_records.append(parsed_comment)
+
+        for parsed_record in self.parsed_records:
+            print("*" * 150)
+            pprint.pprint({
+                "original_comment": parsed_record.org_comment, "comment": parsed_record.comment, "people": parsed_record.people,
+                "parts": parsed_record.parts, "months": parsed_record.months, "years": parsed_record.years, "payment_types": parsed_record.payment_types
+            }, indent=2)
+        return self.parsed_records
+
+
+def commented_code():
+    def main(account_records, people):
+
+        list_of_regex_patterns = generate_dictonary_of_patterns(people=people)
+        dicts_found, dicts_not_found, count_extracted = dict(), dict(), 0
+        for account_record in account_records:
+            account_record_id = str(account_record["id"])
+            found_dict = {}
+            process_comment_iteration = clean_text(text=account_record["process_comment"])
+            found_dict, cleaned_process_comment = extract_person_name_with_regex(found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=list_of_regex_patterns, people=people)
+            found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, process_comment=cleaned_process_comment)
+            found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment)
+            found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment)
+            found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment)
+            if found_dict:
+                dicts_found[str(account_record_id)] = found_dict
+            else:
+                dicts_not_found[str(account_record_id)] = account_record_id
+
+        for id_, item in dicts_found.items():
+            months_are_valid = bool(item.get("months", []))
+            years_are_valid = bool(item.get("years", []))
+            payment_types_are_valid = bool(item.get("payment_types", []))
+            apartment_number_are_valid = bool(item.get("apartment_number", []))
+            person_name_are_valid = bool(item.get("name_match", []))
+            account_record_to_save = AccountRecords.query.filter_by(id=int(id_)).first()
+            save_dict = dict(account_records_id=account_record_to_save.id, account_records_uu_id=str(account_record_to_save.uu_id), prediction_model="regex", treshold=1, is_first_prediction=False)
+            update_dict = dict(prediction_model="regex", treshold=1, is_first_prediction=False)
+            if any([months_are_valid, years_are_valid, payment_types_are_valid, apartment_number_are_valid, person_name_are_valid]):
+                count_extracted += 1
+            if months_are_valid:
+                print(f"months: {item['months']}")
+                data_to_save = dumps({"data": item['months']})
+                prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="months", prediction_model="regex").first()
+                if not prediction_result:
+                    created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="months", prediction_result=data_to_save)
+                    created_account_prediction.save()
+                else:
+                    prediction_result.update(**update_dict, prediction_result=data_to_save)
+                    prediction_result.save()
+            if years_are_valid:
+                print(f"years: {item['years']}")
+                data_to_save = dumps({"data": item['years']})
+                prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="years", prediction_model="regex").first()
+                if not prediction_result:
+                    created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="years", prediction_result=data_to_save)
+                    created_account_prediction.save()
+                else:
+                    prediction_result.update(**update_dict, prediction_result=data_to_save)
+                    prediction_result.save()
+            if payment_types_are_valid:
+                print(f"payment_types: {item['payment_types']}")
+                data_to_save = dumps({"data": item['payment_types']})
+                prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="payment_types", prediction_model="regex").first()
+                if not prediction_result:
+                    created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="payment_types", prediction_result=data_to_save)
+                    created_account_prediction.save()
+                else:
+                    prediction_result.update(**update_dict, prediction_result=data_to_save)
+                    prediction_result.save()
+            if apartment_number_are_valid:
+                print(f"apartment_number: {item['apartment_number']}")
+                prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="apartment_number", prediction_model="regex").first()
+                if not prediction_result:
+                    created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="apartment_number", prediction_result=item['apartment_number'])
+                    created_account_prediction.save()
+                else:
+                    prediction_result.update(**update_dict, prediction_result=item['apartment_number'])
+                    prediction_result.save()
+            if person_name_are_valid:
+                print(f"person_name: {item['name_match']}")
+                data_to_save = dumps({"data": item['name_match']})
+                prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="person_name", prediction_model="regex").first()
+                if not prediction_result:
+                    created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="person_name", prediction_result=data_to_save)
+                    created_account_prediction.save()
+                else:
+                    prediction_result.update(**update_dict, prediction_result=data_to_save)
+                    prediction_result.save()
+
+        print("\n===== SUMMARY =====")
+        print(f"extracted data total : {count_extracted}")
+        print(f"not extracted data total : {len(account_records) - count_extracted}")
+        print(f"Total account records processed : {len(account_records)}")
+
+
+# if __name__ == "__main__":
+
+#     people_query = sqlalchemy_text("""
+#         SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
+#         FROM public.people as p
+#         INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
+#         INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
+#         INNER JOIN public.build as b ON b.id = bp.build_id
+#         WHERE b.id = 1
+#         ORDER BY p.id
+#     """)
+
+#     people_raw = session.execute(people_query).all()
+#     remove_duplicate = list()
+#     clean_people_list = list()
+#     for person in people_raw:
+#         merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
+#         if merged_name not in remove_duplicate:
+#             clean_people_list.append(person)
+#             remove_duplicate.append(merged_name)
+
+#     people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]
+#     query_account_records = sqlalchemy_text("""
+#         SELECT a.id, a.iban, a.bank_date, a.process_comment FROM public.account_records as a where currency_value > 0
+#     """)  # and bank_date::date >= '2020-01-01'
+#     account_records = session.execute(query_account_records).all()
+#     account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]
+
+#     try:
+#         main(account_records=account_records, people=people)
+#     except Exception as e:
+#         print(f"{e}")
+
+#     session.close()
+#     session_factory.remove()
diff --git a/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/models.py b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/models.py
new file mode 100644
index 0000000..9fbdf36
--- /dev/null
+++ b/ServicesRunner/AccountRecordServices/Finder/Parser/Comment/models.py
@@ -0,0 +1,93 @@
+from typing import Optional, List
+from pydantic import BaseModel
+
+
+class User(BaseModel):
+    id: int
+    uu_id: str
+    user_tag: str
+    user_type: str
+    email: str
+    phone_number: str
+    related_company: str
+    is_confirmed: bool
+    active: bool
+
+
+class Person(BaseModel):
+    id: int
+    uu_id: str
+    firstname: str
+    surname: str
+    middle_name: Optional[str] = ""
+    birthname: Optional[str] = ""
+    # national_identity_id: str
+    is_confirmed: bool
+    active: bool
+    user: Optional[User] = None
+
+
+class OccupantType(BaseModel):
+    id: int
+    uu_id: str
+    occupant_code: str
+    occupant_type: str
+    is_confirmed: bool
+    active: bool
+    user_type_uu_id: Optional[str] = None
+
+
+class BuildPart(BaseModel):
+    id: int
+    uu_id: str
+    part_no: str
+    part_level: str
+    part_code: str
+    part_gross_size: float
+    part_net_size: float
+    human_livable: bool
+    build_id: int
+    build_uu_id: str
+    is_confirmed: bool
+    active: bool
+    living_spaces: Optional[List['BuildLivingSpace']] = None
+
+
+class BuildLivingSpace(BaseModel):
+    id: int
+    uu_id: str
+    expiry_starts: str
+    expiry_ends: str
+    fix_value: float
+    fix_percent: float
+    agreement_no: str
+    marketing_process: bool
+    build_parts_id: int
+    build_parts_uu_id: str
+    person_id: int
+    person_uu_id: str
+    occupant_type_id: int
+    occupant_type_uu_id: str
+    is_confirmed: bool
+    active: bool
+    person: Optional[Person] = None
+    occupant_type: Optional[OccupantType] = None
+
+
+class BuildingCluster(BaseModel):
+    id: int
+    uu_id: str
+    build_name: str
+    build_no: str
+    build_date: str
+    decision_period_date: str
+    expiry_starts: str
+    expiry_ends: str
+    is_confirmed: bool
+    active: bool
+    build_parts: List['BuildPart'] = []
+
+
+# Update forward references for models with circular dependencies
+BuildPart.update_forward_refs()
+BuildingCluster.update_forward_refs()
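BuildPart names BuildLivingSpace before that class exists, so the string annotation stays an unresolved forward reference until update_forward_refs() is called after both classes are defined. A minimal sketch of the failure mode this guards against, assuming pydantic v1 (where update_forward_refs() is the supported API); Part and Space are stand-ins for BuildPart and BuildLivingSpace:

from typing import List, Optional

from pydantic import BaseModel


class Part(BaseModel):
    id: int
    # String annotation: Space does not exist yet when this class is created
    spaces: Optional[List['Space']] = None


class Space(BaseModel):
    id: int
    part_id: int


# Without this call, instantiating Part with nested spaces raises a ConfigError
# saying the field is not fully defined; the call resolves the 'Space' string.
Part.update_forward_refs()

part = Part(id=1, spaces=[{"id": 10, "part_id": 1}])
print(part.spaces[0].id)  # 10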
diff --git a/ServicesRunner/Depends/config.py b/ServicesRunner/Depends/config.py
index 96a5587..3530377 100644
--- a/ServicesRunner/Depends/config.py
+++ b/ServicesRunner/Depends/config.py
@@ -138,6 +138,7 @@ class ConfigServices:
     TASK_UUID_INDEX_PREFIX: str = "BANK:SERVICES:TASK:UUID"
     TASK_SEEN_PREFIX: str = "BANK:SERVICES:TASK:SEEN"
     TASK_DELETED_PREFIX: str = "BANK:SERVICES:TASK:DELETED"
+    TASK_COMMENT_PARSER: str = "BANK:SERVICES:TASK:COMMENT:PARSER"
 
     SERVICE_PREFIX_MAIL_READER: str = "MailReader"
     SERVICE_PREFIX_MAIL_PARSER: str = "MailParser"
@@ -145,6 +146,7 @@ class ConfigServices:
     SERVICE_PREFIX_FINDER_COMMENT: str = "FinderComment"
     SERVICE_PREFIX_MAIL_SENDER: str = "MailSender"
 
+    TEMPLATE_ACCOUNT_RECORDS: str = "template_accounts.html"
diff --git a/ServicesRunner/Depends/prisma_client.py b/ServicesRunner/Depends/prisma_client.py
index 55ac71c..98aed20 100644
--- a/ServicesRunner/Depends/prisma_client.py
+++ b/ServicesRunner/Depends/prisma_client.py
@@ -132,7 +132,7 @@ class PrismaService:
         table_selected: BaseModelClient = getattr(db, table, None)
         if not table_selected:
             raise ValueError(f"Table {table} not found")
-        rows = await table_selected.find_many(where=query, take=take, skip=skip, order=order or [], select=select, include=include)
+        rows = await table_selected.find_many(where=query, take=take, skip=skip, order=order or [], include=include)
         # print(f"[{datetime.now()}] Find many query completed in {time.time() - start:.2f}s")
         return rows
@@ -234,7 +234,7 @@ class PrismaService:
         self, table: str, query: Optional[dict] = None, take: int = None, skip: int = None, order: Optional[list[dict]] = None, select: Optional[dict] = None, include: Optional[dict] = None
     ):
-        result = self._submit(self._a_find_many(table=table, query=query, take=take, skip=skip, order=order, select=select, include=include))
+        result = self._submit(self._a_find_many(table=table, query=query, take=take, skip=skip, order=order, include=include))
         if select and result:
             result = [{k: v for k, v in item.items() if k in select} for item in result]
         return result
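The find_many change above stops forwarding select to the underlying Prisma call and instead filters keys on the rows after they come back. A standalone sketch of that post-filtering step (the sample rows are made up); note the trade-off: the database still returns full rows, only the dicts handed to the caller are trimmed:

from typing import Optional


def apply_select(rows: list[dict], select: Optional[dict]) -> list[dict]:
    """Keep only the keys named in select, mirroring the filtering in find_many."""
    if not (select and rows):
        return rows
    return [{k: v for k, v in row.items() if k in select} for row in rows]


rows = [{"id": 1, "iban": "TR00", "process_comment": "aidat odemesi"}]
print(apply_select(rows, select={"id": True, "iban": True}))
# [{'id': 1, 'iban': 'TR00'}]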
diff --git a/ServicesRunner/Depends/service_handler.py b/ServicesRunner/Depends/service_handler.py
index 7290aeb..a90e8d0 100644
--- a/ServicesRunner/Depends/service_handler.py
+++ b/ServicesRunner/Depends/service_handler.py
@@ -605,15 +605,6 @@ class MailReaderService:
         self.service_retriever = ServiceTaskRetriever(self.redis_handler)
         self._initialized = True
 
-    def ensure_connection(self):
-        """
-        Ensure Redis connection is established
-
-        Returns:
-            bool: True if connection is established, False otherwise
-        """
-        return self.redis_handler.ensure_connection()
-
     def get_task_by_uuid(self, task_uuid: str) -> RedisTaskObject:
         """
         Get a task object by its UUID
@@ -840,34 +831,6 @@ class MailParserService:
     def fetch_all_tasks(self) -> list[RedisTaskObject]:
         return self.service_retriever.fetch_all_tasks()
 
-    def ensure_connection(self):
-        """
-        Ensure Redis connection is established
-
-        Returns:
-            bool: True if connection is established, False otherwise
-        """
-        return self.redis_handler.ensure_connection()
-
-    def _check_redis_connection(self) -> bool:
-        """
-        Check if Redis connection is alive using RedisHandler
-
-        Returns:
-            True if connection is alive, False otherwise
-        """
-        try:
-            # Use RedisHandler to check connection
-            connection_status = self.redis_handler.ensure_connection()
-            if connection_status:
-                logger.info("Redis connection established via RedisHandler")
-            else:
-                logger.error("Redis connection check failed via RedisHandler")
-            return connection_status
-        except RedisHandler.REDIS_EXCEPTIONS as e:
-            logger.error(f"Redis connection failed: {str(e)}")
-            return False
-
     def get_task_by_uuid(self, task_uuid: str) -> RedisTaskObject:
         """
         Get a task object by its UUID
@@ -948,34 +911,6 @@ class IbanFinderService:
     def fetch_all_tasks(self) -> list[RedisTaskObject]:
         return self.service_retriever.fetch_all_tasks()
 
-    def ensure_connection(self):
-        """
-        Ensure Redis connection is established
-
-        Returns:
-            bool: True if connection is established, False otherwise
-        """
-        return self.redis_handler.ensure_connection()
-
-    def _check_redis_connection(self) -> bool:
-        """
-        Check if Redis connection is alive using RedisHandler
-
-        Returns:
-            True if connection is alive, False otherwise
-        """
-        try:
-            # Use RedisHandler to check connection
-            connection_status = self.redis_handler.ensure_connection()
-            if connection_status:
-                logger.info("Redis connection established via RedisHandler")
-            else:
-                logger.error("Redis connection check failed via RedisHandler")
-            return connection_status
-        except RedisHandler.REDIS_EXCEPTIONS as e:
-            logger.error(f"Redis connection failed: {str(e)}")
-            return False
-
     def get_task_by_uuid(self, task_uuid: str) -> RedisTaskObject:
         """
         Get a task object by its UUID
@@ -1152,3 +1087,41 @@ class ProcessCommentFinderService:
         return self.service_retriever.delete_task(task_uuid, max_retries)
 
 
+class ProcessCommentParserService:
+    """
+    Class for processing comment parser tasks
+    """
+
+    instance = None
+    REDIS_EXCEPTIONS = RedisHandler.REDIS_EXCEPTIONS
+
+    def __init__(self):
+        if hasattr(self, '_initialized') and self._initialized:
+            return
+        self.service_retriever: ServiceTaskRetriever = ServiceTaskRetriever()
+        self._initialized = True
+
+    def fetch_all_tasks(self) -> list[RedisTaskObject]:
+        """
+        Get all tasks from Redis
+
+        Returns:
+            list: List of task objects
+        """
+        return self.service_retriever.fetch_all_tasks_parser()
+
+    def get_task_requirements(self) -> dict:
+        """
+        Get task requirements from Redis
+
+        Returns:
+            dict: Task requirements if found, otherwise None
+        """
+        if task_object := self.service_retriever.redis_handler.get(ConfigServices.TASK_COMMENT_PARSER):
+            return loads(task_object)
+        return None
+
+    def set_task_requirements(self, task_object: RedisTaskObject):
+        """
+        Set task requirements in Redis; the payload must be JSON-serializable
+        """
+        return self.service_retriever.redis_handler.set(ConfigServices.TASK_COMMENT_PARSER, dumps(task_object))
diff --git a/docker-compose.bank.yml b/docker-compose.bank.yml
index 0287c54..cdaafa7 100644
--- a/docker-compose.bank.yml
+++ b/docker-compose.bank.yml
@@ -120,6 +120,22 @@ services:
       options:
         max-size: "10m"
         max-file: "3"
+
+  process_comment_parser:
+    container_name: process_comment_parser
+    build:
+      context: .
+      dockerfile: ServicesRunner/AccountRecordServices/Finder/Parser/Comment/Dockerfile
+    networks:
+      - bank-services-network
+    volumes:
+      - ./ServicesRunner/AccountRecordServices/Finder/Parser/Comment/venv:/opt/venv
+      - ./ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.prisma-cache:/root/.cache/prisma-python
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
   # finder_payments:
   #   container_name: finder_payments
   #   env_file:
@@ -135,8 +151,6 @@ services:
   #       max-size: "10m"
   #       max-file: "3"
-
-
 networks:
   bank-services-network:
     driver: bridge
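For reference, a hypothetical end-to-end use of the ProcessCommentParserService added in service_handler.py above. The import path follows the file location in this diff, and the shape of the requirements payload is an assumption, not taken from the source:

# Hypothetical usage sketch; assumes Redis is reachable and that the
# requirements payload is a plain JSON-serializable dict.
from ServicesRunner.Depends.service_handler import ProcessCommentParserService

parser_service = ProcessCommentParserService()

# Publish the requirements once; they are stored under ConfigServices.TASK_COMMENT_PARSER.
parser_service.set_task_requirements({"build_ids": [1], "generated_at": "2025-01-01"})

# Any worker can read them back later; None means nothing has been published yet.
requirements = parser_service.get_task_requirements()
if requirements is not None:
    print(requirements["build_ids"])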