updated Parser comment

Berkay 2025-08-14 00:10:57 +03:00
parent 7a5521648c
commit 4ec9031005
17 changed files with 949 additions and 592 deletions

.gitignore
View File

@@ -56,8 +56,12 @@ pids
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
env
.env
-ServicesRunner/AccountRecordServices/Finder/Iban/.prisma-cache
-ServicesRunner/AccountRecordServices/Finder/Comment/.prisma-cache
+# ServicesRunner/AccountRecordServices/Finder/Iban/.prisma-cache
+# ServicesRunner/AccountRecordServices/Finder/Comment/.prisma-cache
+# ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.prisma-cache
+**/.prisma-cache
venv/
.vscode/
__pycache__/

View File

@@ -6,6 +6,7 @@ import { PrismaService } from '@/src/prisma.service';
import { EventsService } from '@/src/navigator/events/events.service';
import { PagesService } from '@/src/navigator/pages/pages.service';
import { MenusService } from '@/src/navigator/menus/menu.service';
+import { includes } from 'zod';
@Injectable()
export class SelectService {

View File

@@ -2,7 +2,7 @@ import time
import arrow
import pprint
-from json import dumps
+from json import dumps, loads
from decimal import Decimal
from pydantic import BaseModel
@@ -85,7 +85,8 @@ def enclose_task_and_send_mail_to_build_manager(prisma_service: PrismaService, s
for build_id, saved_list_of_account_record in saved_list_of_account_records.items():
build_manager_occupant_type = prisma_service.find_first(table="occupant_types", query={"occupant_code": "BU-MNG", "is_confirmed": True, "active": True})
living_space = prisma_service.find_first(
-table="build_living_space", query={"build_id": build_id, "occupant_type_id": build_manager_occupant_type['id'], "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}}
+table="build_living_space", query={
+"build_id": build_id, "occupant_type_id": build_manager_occupant_type['id'], "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}}
)
build = prisma_service.find_first(table="builds", query={"id": build_id})
person = prisma_service.find_first(table="people", query={"id": living_space['person_id']})
@@ -105,8 +106,10 @@ def enclose_task_and_send_mail_to_build_manager(prisma_service: PrismaService, s
if __name__ == "__main__":
prisma_service = PrismaService()
process_comment_finder_service = ProcessCommentFinderService()
print("Process Comment service started")
try:
print("Process Comment service started sleeping for 5 seconds")
@@ -117,7 +120,10 @@ if __name__ == "__main__":
for task in tasks:
if not check_task_belong_to_this_service(task):
continue
-write_account_records_row_from_finder_comment(finder_comments=task.data.FinderComment, prisma_service=prisma_service, saved_list_of_account_records=saved_list_of_account_records)
+write_account_records_row_from_finder_comment(
+finder_comments=task.data.FinderComment, prisma_service=prisma_service, saved_list_of_account_records=saved_list_of_account_records
+)
+save_task_object_for_comment_parsing(task=task, process_comment_finder_service=process_comment_finder_service)
process_comment_finder_service.update_task_status(task_uuid=task.task, is_completed=True, status=Status.COMPLETED)
process_comment_finder_service.delete_task(task_uuid=task.task)
enclose_task_and_send_mail_to_build_manager(
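The new save_task_object_for_comment_parsing call is defined outside this hunk, so its body is not visible here. A purely hypothetical sketch of the hand-off it appears to perform (the create_task method and payload layout are assumptions, not the repository's actual code):

def save_task_object_for_comment_parsing(task, process_comment_finder_service):
    # Hypothetical: re-queue the finder task's payload under the comment-parser key
    # (cf. ConfigServices.TASK_COMMENT_PARSER added in this commit) so the new
    # Parser/Comment service can consume it.
    process_comment_finder_service.create_task(service=ConfigServices.TASK_COMMENT_PARSER, data=task.data)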

View File

@@ -1,518 +0,0 @@
import re
import arrow
from json import loads, dumps
from unidecode import unidecode
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter
turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
turkish_months_abbr = {
"OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
"TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
month_to_number_dict = {
"ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6, "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11, "aralık": 12,
"ocak": 1, "subat": 2, "mart": 3, "nisan": 4, "mayis": 5, "haziran": 6, "temmuz": 7, "agustos": 8, "eylul": 9, "ekim": 10, "kasim": 11, "aralik": 12
}
start_year = 1950
current_year = arrow.now().year
def clean_text(text):
text = str(text)
text = re.sub(r'\d{8,}', ' ', text)
# text = re.sub(r'\b[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*\b|\b[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*\b', ' ', text)
text = text.replace("/", " ")
text = text.replace("_", " ")
text_remove_underscore = text.replace("-", " ").replace("+", " ")
text_remove_asterisk = text_remove_underscore.replace("*", " ")
text_remove_comma = text_remove_asterisk.replace(",", " ")
text_remove_dots = text_remove_comma.replace(".", " ")
text_remove_dots = re.sub(r'\s+', ' ', text_remove_dots)
text_remove_dots = text_remove_dots.strip()
return text_remove_dots
def normalize_text(text: str):
text = text.replace('İ', 'i')
text = text.replace('I', 'ı')
text = text.replace('Ş', 'ş')
text = text.replace('Ğ', 'ğ')
text = text.replace('Ü', 'ü')
text = text.replace('Ö', 'ö')
text = text.replace('Ç', 'ç')
return unidecode(text).lower()
def get_person_initials(person: dict):
parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("surname", ""), person.get("birthname", "")]
return [unidecode(p.strip())[0].upper() for p in parts if p]
def get_text_initials(matched_text: str):
return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]
def generate_dictonary_of_patterns(people: list[dict]):
"""
completly remove middle_name instead do regex firstName + SomeWord + surname
"""
patterns_dict = {}
for person in people:
person_id = person.get('id')
firstname = person.get('firstname', '').strip() if person.get('firstname') else ""
middle_name = person.get('middle_name', '').strip() if person.get('middle_name') else ""
surname = person.get('surname', '').strip() if person.get('surname') else ""
birthname = person.get('birthname', '').strip() if person.get('birthname') else ""
if not firstname or not surname:
continue
name_parts = {
'firstname': {
'orig': firstname,
'norm': normalize_text(firstname) if firstname else "",
'init': normalize_text(firstname)[0] if firstname else ""
},
'surname': {
'orig': surname,
'norm': normalize_text(surname) if surname else "",
'init': normalize_text(surname)[0] if surname else ""
}
}
if middle_name:
name_parts['middle_name'] = {
'orig': middle_name,
'norm': normalize_text(middle_name) if middle_name else "",
'init': normalize_text(middle_name)[0] if middle_name else ""
}
if birthname and normalize_text(birthname) != normalize_text(surname):
name_parts['birthname'] = {
'orig': birthname,
'norm': normalize_text(birthname),
'init': normalize_text(birthname)[0] if birthname else ""
}
person_patterns = set()
def create_pattern(parts, formats, separators=None):
if separators is None:
separators = [""]
patterns = []
for fmt in formats:
for sep in separators:
pattern_parts = []
for part_type, part_name in fmt:
if part_name in parts and part_type in parts[part_name]:
pattern_parts.append(re.escape(parts[part_name][part_type]))
if pattern_parts:
patterns.append(r"\b" + sep.join(pattern_parts) + r"\b")
return patterns
name_formats = [
[('orig', 'firstname'), ('orig', 'surname')],
[('norm', 'firstname'), ('norm', 'surname')],
[('orig', 'surname'), ('orig', 'firstname')],
[('norm', 'surname'), ('norm', 'firstname')],
]
if 'middle_name' in name_parts:
name_formats = [
[('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')],
[('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')],
]
person_patterns.update(create_pattern(name_parts, name_formats, [" ", ""]))
if 'middle_name' in name_parts:
middle_name_formats = [
[('orig', 'firstname'), ('orig', 'middle_name')],
[('norm', 'firstname'), ('norm', 'middle_name')],
[('orig', 'middle_name'), ('orig', 'surname')],
[('norm', 'middle_name'), ('norm', 'surname')],
]
person_patterns.update(create_pattern(name_parts, middle_name_formats, [" ", ""]))
if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']:
birthname_formats = [
[('orig', 'firstname'), ('orig', 'birthname')],
[('norm', 'firstname'), ('norm', 'birthname')],
[('orig', 'birthname'), ('orig', 'firstname')],
[('norm', 'birthname'), ('norm', 'firstname')],
]
person_patterns.update(create_pattern(name_parts, birthname_formats, [" ", ""]))
initial_formats = [[('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')], [('init', 'firstname'), ('init', 'surname')]]
person_patterns.update(create_pattern(name_parts, initial_formats, ["", ".", " ", ". "]))
if 'middle_name' in name_parts:
triple_initial_formats = [
[('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')],
]
person_patterns.update(create_pattern(name_parts, triple_initial_formats, ["", ".", " ", ". "]))
compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns]
patterns_dict[person_id] = compiled_patterns
return patterns_dict
def extract_person_name_with_regex(found_dict: dict, process_comment: str, patterns_dict: dict, people: list[dict]):
cleaned_text = process_comment
all_matches = []
for person_id, patterns in patterns_dict.items():
person = next((p for p in people if p.get('id') == person_id), None)
if not person:
continue
firstname_norm = normalize_text(person.get("firstname", "").strip()) if person.get("firstname") else ""
middle_name_norm = normalize_text(person.get("middle_name", "").strip()) if person.get("middle_name") else ""
surname_norm = normalize_text(person.get("surname", "").strip()) if person.get("surname") else ""
birthname_norm = normalize_text(person.get("birthname", "").strip()) if person.get("birthname") else ""
text_norm = normalize_text(process_comment)
person_matches = []
for pattern in patterns:
for match in pattern.finditer(text_norm):
start, end = match.span()
matched_text = process_comment[start:end]
matched_text_norm = normalize_text(matched_text)
is_valid_match = False
if len(matched_text_norm.split()) <= 1:
is_valid_match = False
else:
has_firstname = firstname_norm and firstname_norm in matched_text_norm
has_surname = surname_norm and surname_norm in matched_text_norm
has_birthname = birthname_norm and birthname_norm in matched_text_norm
if (has_firstname and has_surname) or (has_firstname and has_birthname):
is_valid_match = True
if is_valid_match:
person_matches.append({'matched_text': matched_text, 'start': start, 'end': end})
if person_matches:
person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True)
non_overlapping_matches = []
for match in person_matches:
overlaps = False
for existing_match in non_overlapping_matches:
if (match['start'] < existing_match['end'] and match['end'] > existing_match['start']):
overlaps = True
break
if not overlaps:
non_overlapping_matches.append(match)
if non_overlapping_matches:
found_dict["name_match"] = person
all_matches.extend([(match, person) for match in non_overlapping_matches])
if all_matches:
all_matches.sort(key=lambda x: x[0]['start'], reverse=True)
for match, person in all_matches:
matched_text = match['matched_text']
matched_words = matched_text.split()
for word in matched_words:
word_norm = normalize_text(word).strip()
if not word_norm:
continue
text_norm = normalize_text(cleaned_text)
for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE):
start, end = word_match.span()
cleaned_text = cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:]
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
return found_dict, cleaned_text
def extract_build_parts_info(found_dict, process_comment):
"""
Regex of parts such as :
2 nolu daire
9 NUMARALI DAI
daire 3
3 nolu dairenin
11nolu daire
Daire No 12
2NOLU DAIRE
12 No lu daire
D:10
NO:11
NO :3
"""
apartment_number = None
cleaned_text = process_comment
def clean_text_apartment_number(text, match):
clean_text = text.replace(match.group(0), '').strip()
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
return clean_text
pattern1 = re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE)
match = pattern1.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern4 = re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE)
match = pattern4.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern5 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE)
match = pattern5.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern7 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE)
match = pattern7.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern8 = re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE)
match = pattern8.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern6 = re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE)
match = pattern6.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern2 = re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE)
match = pattern2.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern3 = re.compile(r'daire\s*(\d+)', re.IGNORECASE)
match = pattern3.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern9 = re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE)
match = pattern9.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
pattern10 = re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE)
match = pattern10.search(cleaned_text)
if match:
apartment_number = match.group(1)
found_dict['apartment_number'] = apartment_number
return found_dict, clean_text_apartment_number(cleaned_text, match)
return found_dict, cleaned_text
def extract_months(found_dict, process_comment):
"""
Extract Turkish month names and abbreviations from the process comment
"""
original_text = process_comment
def normalize_turkish(text: str) -> str:
"""Properly normalize Turkish text for case-insensitive comparison"""
text = text.lower()
text = text.replace('İ'.lower(), 'i') # Handle dotted i properly ('İ'.lower() yields 'i' plus a combining dot)
text = text.replace('ı', 'i') # Convert dotless i to regular i for matching
text = unidecode(text) # Remove other diacritics
return text
if 'months' not in found_dict:
found_dict['months'] = []
months_found, working_text = False, original_text
for month in turkish_months:
pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
for match in pattern.finditer(original_text):
matched_text = match.group(0)
normalized_month = normalize_turkish(month)
month_number = None
if month.lower() in month_to_number_dict:
month_number = month_to_number_dict[month.lower()]
elif normalized_month in month_to_number_dict:
month_number = month_to_number_dict[normalized_month]
month_info = {'name': month, 'number': month_number}
found_dict['months'].append(month_info)
months_found = True
working_text = working_text.replace(matched_text, '', 1)
for abbr, full_month in turkish_months_abbr.items():
pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
for match in pattern.finditer(working_text):
matched_text = match.group(0)
normalized_month = normalize_turkish(full_month)
month_number = None
if full_month.lower() in month_to_number_dict:
month_number = month_to_number_dict[full_month.lower()]
elif normalized_month in month_to_number_dict:
month_number = month_to_number_dict[normalized_month]
month_info = {'name': full_month, 'number': month_number}
found_dict['months'].append(month_info)
months_found = True
working_text = working_text.replace(matched_text, '', 1)
return found_dict, working_text
def extract_year(found_dict, process_comment):
"""
Extract years from the process comment
"""
original_text = process_comment
if 'years' not in found_dict:
found_dict['years'] = []
working_text = original_text
for year in range(start_year, current_year + 1):
pattern = re.compile(r'\b' + str(year) + r'\b', re.IGNORECASE)
for match in pattern.finditer(original_text):
matched_text = match.group(0)
if str(matched_text).isdigit():
found_dict['years'].append(int(matched_text))
working_text = working_text.replace(matched_text, '', 1)
return found_dict, working_text
def extract_payment_type(found_dict, process_comment):
"""
Extract payment type from the process comment
aidat
AİD
aidatı
TADİLAT
YAKIT
yakıt
yakit
"""
original_text = process_comment
working_text = original_text
if 'payment_types' not in found_dict:
found_dict['payment_types'] = []
payment_keywords = {'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'], 'tadilat': ['tadilat', 'tadİlat', 'tadilatı'], 'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti']}
for payment_type, keywords in payment_keywords.items():
for keyword in keywords:
pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
for match in pattern.finditer(original_text):
matched_text = match.group(0)
if payment_type not in found_dict['payment_types']:
found_dict['payment_types'].append(payment_type)
working_text = working_text.replace(matched_text, '', 1)
return found_dict, working_text
def main(account_records, people):
list_of_regex_patterns = generate_dictonary_of_patterns(people=people)
dicts_found = dict()
dicts_not_found = dict()
count_extracted = 0
for account_record in account_records:
account_record_id = str(account_record["id"])
found_dict = {}
process_comment_iteration = clean_text(text=account_record["process_comment"])
found_dict, cleaned_process_comment = extract_person_name_with_regex(found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=list_of_regex_patterns, people=people)
found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, process_comment=cleaned_process_comment)
found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment)
found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment)
found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment)
if found_dict:
dicts_found[str(account_record_id)] = found_dict
else:
dicts_not_found[str(account_record_id)] = account_record_id
for id_, item in dicts_found.items():
months_are_valid = bool(item.get("months", []))
years_are_valid = bool(item.get("years", []))
payment_types_are_valid = bool(item.get("payment_types", []))
apartment_number_are_valid = bool(item.get("apartment_number", []))
person_name_are_valid = bool(item.get("name_match", []))
account_record_to_save = AccountRecords.query.filter_by(id=int(id_)).first()
save_dict = dict(
account_records_id=account_record_to_save.id, account_records_uu_id=str(account_record_to_save.uu_id), prediction_model="regex", treshold=1, is_first_prediction=False
)
update_dict = dict(prediction_model="regex", treshold=1, is_first_prediction=False)
if any([months_are_valid, years_are_valid, payment_types_are_valid, apartment_number_are_valid, person_name_are_valid]):
count_extracted += 1
if months_are_valid:
print(f"months: {item['months']}")
data_to_save = dumps({"data": item['months']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="months", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="months", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
if years_are_valid:
print(f"years: {item['years']}")
data_to_save = dumps({"data": item['years']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="years", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="years", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
if payment_types_are_valid:
print(f"payment_types: {item['payment_types']}")
data_to_save = dumps({"data": item['payment_types']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="payment_types", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="payment_types", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
if apartment_number_are_valid:
print(f"apartment_number: {item['apartment_number']}")
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="apartment_number", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="apartment_number", prediction_result=item['apartment_number'])
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=item['apartment_number'])
prediction_result.save()
if person_name_are_valid:
print(f"person_name: {item['name_match']}")
data_to_save = dumps({"data": item['name_match']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="person_name", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="person_name", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
print("\n===== SUMMARY =====")
print(f"extracted data total : {count_extracted}")
print(f"not extracted data total : {len(account_records) - count_extracted}")
print(f"Total account records processed : {len(account_records)}")
if __name__ == "__main__":
people_query = sqlalchemy_text("""
SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
FROM public.people as p
INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
INNER JOIN public.build as b ON b.id = bp.build_id
WHERE b.id = 1
ORDER BY p.id
""")
people_raw = session.execute(people_query).all()
remove_duplicate = list()
clean_people_list = list()
for person in people_raw:
merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
if merged_name not in remove_duplicate:
clean_people_list.append(person)
remove_duplicate.append(merged_name)
people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]
query_account_records = sqlalchemy_text("""
SELECT a.id, a.iban, a.bank_date, a.process_comment FROM public.account_records as a where currency_value > 0
""") # and bank_date::date >= '2020-01-01'
account_records = session.execute(query_account_records).all()
account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]
try:
main(session=session, account_records=account_records, people=people)
except Exception as e:
print(f"{e}")
session.close()
session_factory.remove()

View File

@@ -0,0 +1,14 @@
__pycache__/
*.pyc
*.pyo
*.pyd
*.db
*.sqlite3
*.log
*.env
venv/
.env.*
node_modules/
.prisma/
.prisma-cache/
ServicesRunner/AccountRecordServices/Test/venv/

View File

@@ -0,0 +1,22 @@
FROM python:3.12-slim
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV VIRTUAL_ENV=/opt/venv
ENV PRISMA_SCHEMA_PATH=/app/Depends/schema.prisma
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHONPATH=/app
RUN apt-get update && apt-get install -y --no-install-recommends gcc curl && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY ServicesRunner/Depends/ /app/Depends/
COPY ServicesRunner/AccountRecordServices/Finder/Parser/Comment /app/
COPY ServicesRunner/requirements.txt /app/requirements.txt
COPY ServicesRunner/AccountRecordServices/Finder/Parser/Comment/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
CMD ["/entrypoint.sh"]

View File

@@ -0,0 +1,161 @@
import time
import arrow
from typing import Optional
from pydantic import BaseModel
from matchers import Parser
from models import BuildingCluster, BuildPart, BuildLivingSpace, Person, User, OccupantType
from Depends.prisma_client import PrismaService
from Depends.config import ConfigServices, RedisTaskObject
from Depends.service_handler import ProcessCommentParserService
def check_task_belong_to_this_service(task: RedisTaskObject):
"""
Check if task belongs to this service
"""
if not task.service == ConfigServices.TASK_COMMENT_PARSER:
return False
if not task.completed:
return False
if task.is_completed:
return False
if not task.data:
return False
return True
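# Illustrative gate check (field values are made up; RedisTaskObject comes from Depends.config):
# task = RedisTaskObject(service=ConfigServices.TASK_COMMENT_PARSER, completed=True,
#                        is_completed=False, data={"FinderComment": [...]}, task="uuid-1")
# check_task_belong_to_this_service(task)  # -> True
# After processing, is_completed is set True and the same task is skipped (-> False).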
def get_all_person_data_due_to_build(prisma_service: PrismaService):
"""
Get all person data due to build with comprehensive inner joins
Returns a dictionary of buildings clustered with their build parts, people, and living spaces
"""
buildings_dict, today = {}, arrow.now().to('GMT+3').datetime
occupant_flat_owner = prisma_service.find_first(table="occupant_types", query={"occupant_code": "FL-OWN", "active": True, "is_confirmed": True}, include={"user_types": True})
occupant_tenant = prisma_service.find_first(table="occupant_types", query={"occupant_code": "FL-TEN", "active": True, "is_confirmed": True}, include={"user_types": True})
possible_money_sender_occupants = [occupant_flat_owner.id, occupant_tenant.id]
buildings = prisma_service.find_many(table="build", query={"active": True, "is_confirmed": True, "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}})
for build in buildings:
buildings_dict[str(build.id)] = BuildingCluster(
id=build.id,
uu_id=build.uu_id,
build_name=build.build_name,
build_no=build.build_no,
build_date=str(build.build_date),
decision_period_date=str(build.decision_period_date),
expiry_starts=str(build.expiry_starts),
expiry_ends=str(build.expiry_ends),
is_confirmed=build.is_confirmed,
active=build.active,
build_parts=[]
)
build_parts = prisma_service.find_many(table="build_parts", query={"build_id": build.id, "active": True, "is_confirmed": True, "human_livable": True, "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}})
for build_part in build_parts:
part_obj = BuildPart(
id=build_part.id,
uu_id=build_part.uu_id,
part_no=build_part.part_no,
part_level=build_part.part_level,
part_code=build_part.part_code,
part_gross_size=build_part.part_gross_size,
part_net_size=build_part.part_net_size,
human_livable=build_part.human_livable,
build_id=build_part.build_id,
build_uu_id=build_part.build_uu_id,
is_confirmed=build_part.is_confirmed,
active=build_part.active,
living_spaces=[],
build=None
)
living_spaces = prisma_service.find_many(
table="build_living_space", include={"occupant_types": True, "people": {"include": {"users": True}}},
query={"build_parts_id": build_part.id, "active": True, "is_confirmed": True, "expiry_starts": {"lte": today}, "expiry_ends": {"gte": today}, "occupant_type_id": {"in": possible_money_sender_occupants}},
)
for living_space in living_spaces:
person = living_space.people
user = prisma_service.find_first(table="users", query={"person_id": person.id, "active": True, "is_confirmed": True})
user_of_person = None
if user:
user_of_person = User(
id=user.id,
uu_id=user.uu_id,
user_tag=user.user_tag,
user_type=user.user_type,
email=user.email,
phone_number=user.phone_number,
related_company=user.related_company,
is_confirmed=user.is_confirmed,
active=user.active
)
person_obj = Person(
id=person.id,
uu_id=person.uu_id,
firstname=person.firstname,
surname=person.surname,
middle_name=person.middle_name,
birthname=person.birthname,
is_confirmed=person.is_confirmed,
active=person.active,
user=user_of_person
)
occupant_type = living_space.occupant_types
occupant_type_obj = OccupantType(
id=occupant_type.id,
uu_id=occupant_type.uu_id,
occupant_code=occupant_type.occupant_code,
occupant_type=occupant_type.occupant_type,
is_confirmed=occupant_type.is_confirmed,
active=occupant_type.active,
user_type_uu_id=occupant_type.user_type_uu_id
)
living_space_obj = BuildLivingSpace(
id=living_space.id,
uu_id=living_space.uu_id,
expiry_starts=str(living_space.expiry_starts),
expiry_ends=str(living_space.expiry_ends),
fix_value=float(living_space.fix_value),
fix_percent=float(living_space.fix_percent),
agreement_no=living_space.agreement_no,
marketing_process=living_space.marketing_process,
build_parts_id=living_space.build_parts_id,
build_parts_uu_id=living_space.build_parts_uu_id,
person_id=living_space.person_id,
person_uu_id=living_space.person_uu_id,
occupant_type_id=living_space.occupant_type_id,
occupant_type_uu_id=living_space.occupant_type_uu_id,
is_confirmed=living_space.is_confirmed,
active=living_space.active,
person=person_obj,
occupant_type=occupant_type_obj
)
part_obj.living_spaces.append(living_space_obj)
buildings_dict[str(build.id)].build_parts.append(part_obj)
return {i: v.dict(exclude_none=True) for i, v in buildings_dict.items()}
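# Shape of the returned mapping, sketched with illustrative values:
# {"1": {"id": 1, "build_name": "A Blok", ...,
#        "build_parts": [{"id": 10, "part_no": "5", ...,
#                         "living_spaces": [{"id": 100, ...,
#                                            "person": {"firstname": "Ali", "surname": "Kaya", ...},
#                                            "occupant_type": {"occupant_code": "FL-OWN", ...}}]}]}}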
def get_all_companies_data(prisma_service: PrismaService):
return prisma_service.find_many(table="companies", query={"active": True, "is_confirmed": True})
if __name__ == "__main__":
print("Process Comment Parser service started")
prisma_service = PrismaService()
process_comment_parser_service = ProcessCommentParserService()
search_people = get_all_person_data_due_to_build(prisma_service)
process_comment_parser_service.set_task_requirements(search_people)
arriving_account_records = prisma_service.find_many(table="account_records", query={"active": True, "is_confirmed": True, "approved_record": False, "currency_value": {"gt": 0}})
debt_account_records = prisma_service.find_many(table="account_records", query={"active": True, "is_confirmed": True, "approved_record": False, "currency_value": {"lt": 0}})
try:
while True:
time.sleep(5)
print("Process Comment Parser service started sleeping for 5 seconds")
tasks_dict = process_comment_parser_service.get_task_requirements()
task_requirements: dict[str, BuildingCluster] = {idx: BuildingCluster(**value) for idx, value in tasks_dict.items()}
parser = Parser(account_records=arriving_account_records, task_requirements=task_requirements)
parsed_records = parser.parse()
except Exception as e:
print(f"Process Comment Parser service error: {str(e)}")
raise e
finally:
prisma_service.disconnect()

View File

@@ -0,0 +1,19 @@
#!/bin/sh
VENV_PATH="/opt/venv"
REQUIREMENTS_PATH="/app/requirements.txt"
SCHEMA_PATH="/app/Depends/schema.prisma"
PRISMA_BINARY_PATH="/root/.cache/prisma-python/binaries"
if [ ! -x "$VENV_PATH/bin/python" ]; then
python -m venv "$VENV_PATH"
"$VENV_PATH/bin/pip" install pip --upgrade
"$VENV_PATH/bin/pip" install --no-cache-dir -r "$REQUIREMENTS_PATH"
"$VENV_PATH/bin/prisma" generate --schema "$SCHEMA_PATH"
fi
if ! find "$PRISMA_BINARY_PATH" -type f -name "prisma-query-engine-debian-openssl-3.0.x" | grep -q .; then
"$VENV_PATH/bin/prisma" py fetch
fi
exec "$VENV_PATH/bin/python" -u app.py

View File

@@ -0,0 +1,566 @@
import pprint
import re
import arrow
from json import loads, dumps
from unidecode import unidecode
from models import BuildingCluster, Person
turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
turkish_months_abbr = {
"OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
"TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK", "AGUSTOS": "AĞUSTOS"
}
month_to_number_dict = {
"ocak": 1, "şubat": 2, "mart": 3, "nisan": 4, "mayıs": 5, "haziran": 6, "temmuz": 7, "ağustos": 8, "eylül": 9, "ekim": 10, "kasım": 11, "aralık": 12,
"ocak": 1, "subat": 2, "mart": 3, "nisan": 4, "mayis": 5, "haziran": 6, "temmuz": 7, "agustos": 8, "eylul": 9, "ekim": 10, "kasim": 11, "aralik": 12
}
start_year = 1950
current_year = arrow.now().year
class ParsedComment:
def __init__(self, account_record_id: int, org_comment: str) -> None:
self.account_record_id: int = account_record_id
self.org_comment: str = org_comment
self.comment: str | None = None
self.people: list[dict] = []
self.parts: list[dict] = []
self.months: list[dict] = []
self.years: list[int] = []
self.payment_types: list[str] = []
def set_people(self, people: list[dict]) -> None:
self.people = people
def set_parts(self, parts: list[dict]) -> None:
self.parts = parts
def set_months(self, months: list[dict]) -> None:
self.months = months
def set_years(self, years: list[int]) -> None:
self.years = years
def set_payment_types(self, payment_types: list[str]) -> None:
self.payment_types = payment_types
class ParserHelpers:
@staticmethod
def normalize_text(text: str) -> str:
text = text.replace('İ', 'i')
text = text.replace('I', 'ı')
text = text.replace('Ş', 'ş')
text = text.replace('Ğ', 'ğ')
text = text.replace('Ü', 'ü')
text = text.replace('Ö', 'ö')
text = text.replace('Ç', 'ç')
return unidecode(text).lower()
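# e.g. normalize_text("İSMAİL ÇELİK") -> "ismail celik": uppercase Turkish letters are
# mapped by hand first, then unidecode() transliterates the rest and the result is lowercased.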
class ParserRequirements(ParserHelpers):
@staticmethod
def create_pattern(parts, formats, separators=None):
"""
parts: dict
formats: list[list[tuple[str, str]]]
separators: list[str]
"""
if separators is None:
separators = [""]
patterns = []
for fmt in formats:
for sep in separators:
pattern_parts = []
for part_type, part_name in fmt:
if part_name in parts and part_type in parts[part_name]:
pattern_parts.append(re.escape(parts[part_name][part_type]))
if pattern_parts:
patterns.append(r"\b" + sep.join(pattern_parts) + r"\b")
return patterns
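# Sketch (illustrative input): create_pattern(
#     {'firstname': {'orig': 'Ali'}, 'surname': {'orig': 'Kaya'}},
#     formats=[[('orig', 'firstname'), ('orig', 'surname')]], separators=[" ", ""])
# -> [r'\bAli Kaya\b', r'\bAliKaya\b']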
@classmethod
def generate_dictonary_of_patterns(cls, person: Person):
"""Completly remove middle_name instead do regex firstName + SomeWord + surname"""
patterns_dict = {}
person_patterns, firstname, birthname = set(), person.firstname.strip() if person.firstname else "", person.birthname.strip() if person.birthname else ""
middle_name, surname = person.middle_name.strip() if person.middle_name else "", person.surname.strip() if person.surname else ""
if not firstname or not surname:
return patterns_dict
name_parts = {
'firstname': {'orig': firstname, 'norm': cls.normalize_text(firstname) if firstname else "", 'init': cls.normalize_text(firstname)[0] if firstname else ""},
'surname': {'orig': surname, 'norm': cls.normalize_text(surname) if surname else "", 'init': cls.normalize_text(surname)[0] if surname else ""}
}
if middle_name:
name_parts['middle_name'] = {'orig': middle_name, 'norm': cls.normalize_text(middle_name) if middle_name else "", 'init': cls.normalize_text(middle_name)[0] if middle_name else ""}
if birthname and cls.normalize_text(birthname) != cls.normalize_text(surname):
name_parts['birthname'] = {'orig': birthname, 'norm': cls.normalize_text(birthname), 'init': cls.normalize_text(birthname)[0] if birthname else ""}
name_formats = [[('orig', 'firstname'), ('orig', 'surname')], [('norm', 'firstname'), ('norm', 'surname')], [('orig', 'surname'), ('orig', 'firstname')], [('norm', 'surname'), ('norm', 'firstname')]]
if 'middle_name' in name_parts:
name_formats = [[('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')], [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')]]
person_patterns.update(cls.create_pattern(name_parts, name_formats, [" ", ""]))
if 'middle_name' in name_parts:
middle_name_formats = [[('orig', 'firstname'), ('orig', 'middle_name')], [('norm', 'firstname'), ('norm', 'middle_name')], [('orig', 'middle_name'), ('orig', 'surname')], [('norm', 'middle_name'), ('norm', 'surname')],]
person_patterns.update(cls.create_pattern(name_parts, middle_name_formats, [" ", ""]))
if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']:
birthname_formats = [
[('orig', 'firstname'), ('orig', 'birthname')], [('norm', 'firstname'), ('norm', 'birthname')],
[('orig', 'birthname'), ('orig', 'firstname')], [('norm', 'birthname'), ('norm', 'firstname')]
]
person_patterns.update(cls.create_pattern(name_parts, birthname_formats, [" ", ""]))
initial_formats = [[('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')], [('init', 'firstname'), ('init', 'surname')]]
person_patterns.update(cls.create_pattern(name_parts, initial_formats, ["", ".", " ", ". "]))
if 'middle_name' in name_parts:
triple_initial_formats = [[('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')]]
person_patterns.update(cls.create_pattern(name_parts, triple_initial_formats, ["", ".", " ", ". "]))
compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns]
patterns_dict[str(person.id)] = compiled_patterns
return patterns_dict
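# For an illustrative Person(id=7, firstname="Ali", surname="Kaya") this yields roughly
# {'7': [re.compile(r'\bAli Kaya\b', re.IGNORECASE), re.compile(r'\bali kaya\b', re.IGNORECASE),
#        re.compile(r'\bAliKaya\b', re.IGNORECASE), re.compile(r'\ba k\b', re.IGNORECASE), ...]}
# (initials are taken from the normalized names, hence 'a' and 'k').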
class CommentParser(ParserHelpers):
def __init__(self, account_record, people_regex_dict: dict, people_dict: dict) -> None:
self.original_comment: str = account_record.process_comment
self.comment: str = self.clean_text(account_record.process_comment)
self.people_regex_dict: dict = people_regex_dict
self.people: dict = people_dict
self.account_record_id: str = str(account_record.id)
self.build_id: str = str(account_record.build_id)
self.parsed_comment: ParsedComment = ParsedComment(account_record_id=self.account_record_id, org_comment=self.original_comment)
@staticmethod
def clean_text_apartment_number(text: str, match):
clean_text = text.replace(match.group(0), '').strip()
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
return clean_text
@staticmethod
def clean_text(text: str) -> str:
text = str(text)
text = re.sub(r'\d{8,}', ' ', text)
# text = re.sub(r'\b[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*\b|\b[A-Za-z0-9]*?[A-Za-z]+[A-Za-z0-9]*?[0-9]+[A-Za-z0-9]*\b', ' ', text)
text = text.replace("/", " ")
text = text.replace("_", " ")
text_remove_underscore = text.replace("-", " ").replace("+", " ")
text_remove_asterisk = text_remove_underscore.replace("*", " ")
text_remove_comma = text_remove_asterisk.replace(",", " ")
text_remove_dots = text_remove_comma.replace(".", " ")
text_remove_dots = re.sub(r'\s+', ' ', text_remove_dots)
text_remove_dots = text_remove_dots.strip()
return text_remove_dots
def get_people_regex_by_build_id(self) -> dict:
"""
Get people regex by build id
"""
return self.people_regex_dict.get(self.build_id, {})
def get_person(self, person_id: str) -> Person | None:
return self.people[str(self.build_id)].get(person_id, None)
def parse_comment(self) -> ParsedComment:
"""
Parse comment and extract information
"""
self.extract_person_name_with_regex()
self.extract_build_parts_info()
self.extract_months()
self.extract_years()
self.extract_payment_type()
self.comment = self.comment.strip()
self.parsed_comment.comment = self.comment
return self.parsed_comment
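# End-to-end sketch on an illustrative comment:
# "AHMET YILMAZ 2 NOLU DAIRE OCAK 2024 AIDATI" ->
# people: [<Person Ahmet Yilmaz>] (if registered for this build), parts: ['2'],
# months: [{'name': 'OCAK', 'number': 1}], years: [2024], payment_types: ['aidat']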
@staticmethod
def get_text_initials(matched_text: str):
return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]
def extract_person_name_with_regex(self):
all_matches, found_dict = [], {}
build_regex = self.get_people_regex_by_build_id()
for person_id, patterns in build_regex.items():
person_matches = []
person = self.get_person(str(person_id))
if not person:
continue
firstname_norm = str(self.normalize_text(person.firstname)).strip() if person.firstname else ""
# middle_name_norm = str(self.normalize_text(person.middle_name)).strip() if person.middle_name else ""
surname_norm = str(self.normalize_text(person.surname)).strip() if person.surname else ""
birthname_norm = str(self.normalize_text(person.birthname)).strip() if person.birthname else ""
text_norm = str(self.normalize_text(self.comment))
for pattern in patterns[str(person_id)]:
for match in pattern.finditer(text_norm):
start, end = match.span()
matched_text: str = self.comment[start:end]
matched_text_norm = self.normalize_text(matched_text)
is_valid_match = False
if len(matched_text_norm.split()) <= 1:
is_valid_match = False
else:
has_firstname = firstname_norm and firstname_norm in matched_text_norm
has_surname = surname_norm and surname_norm in matched_text_norm
has_birthname = birthname_norm and birthname_norm in matched_text_norm
if (has_firstname and has_surname) or (has_firstname and has_birthname):
is_valid_match = True
if is_valid_match:
person_matches.append({'matched_text': matched_text, 'start': start, 'end': end})
if person_matches:
person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True)
non_overlapping_matches = []
for match in person_matches:
overlaps = False
for existing_match in non_overlapping_matches:
if (match['start'] < existing_match['end'] and match['end'] > existing_match['start']):
overlaps = True
break
if not overlaps:
non_overlapping_matches.append(match)
if non_overlapping_matches:
found_dict["name_match"] = person
all_matches.extend([(match, person) for match in non_overlapping_matches])
if all_matches:
all_matches.sort(key=lambda x: x[0]['start'], reverse=True)
for match, person in all_matches:
matched_text: str = match['matched_text']
matched_words = matched_text.split()
for word in matched_words:
word_norm = str(self.normalize_text(word)).strip()
if not word_norm:
continue
text_norm = self.normalize_text(self.comment)
if not any([person_com for person_com in self.parsed_comment.people if str(person_com.id) == str(person.id)]):
self.parsed_comment.people.append(person)
for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE):
start, end = word_match.span()
self.comment = self.comment[:start] + ' ' * (end - start) + self.comment[end:]
self.comment = re.sub(r'\s+', ' ', self.comment).strip()
def extract_build_parts_info(self):
"""
Regex of parts such as :
2 nolu daire
9 NUMARALI DAI
daire 3
3 nolu dairenin
11nolu daire
Daire No 12
2NOLU DAIRE
12 No lu daire
D:10
NO:11
NO :3
"""
apartment_number = None
pattern1 = re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE)
match = pattern1.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern4 = re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE)
match = pattern4.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern5 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE)
match = pattern5.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern7 = re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE)
match = pattern7.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern8 = re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE)
match = pattern8.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern6 = re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE)
match = pattern6.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern2 = re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE)
match = pattern2.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern3 = re.compile(r'daire\s*(\d+)', re.IGNORECASE)
match = pattern3.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern9 = re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE)
match = pattern9.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
pattern10 = re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE)
match = pattern10.search(self.comment)
if match:
apartment_number = match.group(1)
self.parsed_comment.parts.append(apartment_number)
self.comment = self.clean_text_apartment_number(self.comment, match)
return
# return found_dict, self.comment
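# e.g. "2 NOLU DAIRE OCAK AIDATI" appends '2' to parsed_comment.parts and leaves the
# comment as "OCAK AIDATI"; only the first matching pattern fires per comment.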
def extract_months(self):
"""
Extract Turkish month names and abbreviations from the process comment
"""
original_text = self.comment
working_text = original_text
for month in turkish_months:
pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
for match in pattern.finditer(original_text):
matched_text = match.group(0)
normalized_month = self.normalize_text(month)
month_number = None
if month.lower() in month_to_number_dict:
month_number = month_to_number_dict[month.lower()]
elif normalized_month in month_to_number_dict:
month_number = month_to_number_dict[normalized_month]
month_info = {'name': month, 'number': month_number}
self.parsed_comment.months.append(month_info)
working_text = working_text.replace(matched_text, '', 1)
for abbr, full_month in turkish_months_abbr.items():
pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
for match in pattern.finditer(working_text):
matched_text = match.group(0)
normalized_month = self.normalize_text(full_month)
month_number = None
if full_month.lower() in month_to_number_dict:
month_number = month_to_number_dict[full_month.lower()]
elif normalized_month in month_to_number_dict:
month_number = month_to_number_dict[normalized_month]
month_info = {'name': full_month, 'number': month_number}
self.parsed_comment.months.append(month_info)
working_text = working_text.replace(matched_text, '', 1)
self.comment = working_text
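# e.g. "OCAK SUB KIRA" -> months [{'name': 'OCAK', 'number': 1}, {'name': 'ŞUBAT', 'number': 2}],
# with the matched tokens removed from the working comment.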
def extract_years(self):
"""
Extract years from the process comment
"""
original_text = self.comment
working_text = original_text
for year in range(start_year, current_year + 1):
pattern = re.compile(r'\b' + str(year) + r'\b', re.IGNORECASE)
for match in pattern.finditer(original_text):
matched_text = match.group(0)
if str(matched_text).isdigit():
self.parsed_comment.years.append(int(matched_text))
working_text = working_text.replace(matched_text, '', 1)
self.comment = working_text
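# e.g. "ARALIK 2023 2024" -> years [2023, 2024]; only standalone four-digit numbers between
# 1950 and the current year count as years.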
def extract_payment_type(self):
"""
Extract payment type from the process comment : aidat, AİD, aidatı, TADİLAT, YAKIT, yakıt, yakit
"""
original_text = self.comment
working_text = original_text
payment_keywords = {'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'], 'tadilat': ['tadilat', 'tadİlat', 'tadilatı'], 'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti']}
for payment_type, keywords in payment_keywords.items():
for keyword in keywords:
pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
for match in pattern.finditer(original_text):
matched_text = match.group(0)
if payment_type not in self.parsed_comment.payment_types:
self.parsed_comment.payment_types.append(payment_type)
working_text = working_text.replace(matched_text, '', 1)
self.comment = working_text
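# e.g. "NISAN YAKIT" -> payment_types ['yakit']; each type is recorded at most once even if
# several of its keyword variants occur.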
class Parser:
def __init__(self, account_records: list, task_requirements: dict[str, BuildingCluster]) -> None:
"""
Initialize parser with account records and task requirements
"""
self.account_records: list = account_records
self.task_requirements: dict[str, BuildingCluster] = task_requirements
self.people_dict: dict[str, dict[str, Person]] = {}
self.people_regex_dict: dict = self.prepare_people_regex_dict()
self.parsed_records: list[ParsedComment] = []
def prepare_people_regex_dict(self):
"""Prepare regex dictionary for people"""
regex_pattern_dict = {}
for build_id, build_cluster in self.task_requirements.items():
for build_part in build_cluster.build_parts:
for living_space in build_part.living_spaces:
person: Person = living_space.person
if str(build_id) in self.people_dict:
if not str(person.id) in self.people_dict[str(build_id)]:
self.people_dict[str(build_id)][str(person.id)] = person
else:
self.people_dict[str(build_id)] = {str(person.id): person}
for build_id, people in self.people_dict.items():
people: dict[str, Person] = people
for person_id, person in people.items():
if str(build_id) not in regex_pattern_dict:
regex_pattern_dict[str(build_id)] = {}
regex_pattern_dict[str(build_id)][str(person_id)] = ParserRequirements.generate_dictonary_of_patterns(person)
return regex_pattern_dict
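# The resulting shape is doubly keyed by person id, because generate_dictonary_of_patterns
# itself returns {person_id: [patterns]}:
# {build_id: {person_id: {person_id: [compiled patterns]}}}
# which is why extract_person_name_with_regex indexes patterns[str(person_id)].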
def parse(self):
"""Parse account records based on task requirements"""
for account_record in self.account_records:
comment_parser = CommentParser(account_record=account_record, people_regex_dict=self.people_regex_dict, people_dict=self.people_dict)
parsed_comment = comment_parser.parse_comment()
self.parsed_records.append(parsed_comment)
for parsed_record in self.parsed_records:
print("*" * 150)
pprint.pprint({
"original_comment": parsed_record.org_comment, "comment": parsed_record.comment, "people": parsed_record.people,
"parts": parsed_record.parts, "months": parsed_record.months, "years": parsed_record.years, "payment_types": parsed_record.payment_types
}, indent=2)
return self.parsed_records
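# Typical wiring, mirroring app.py (illustrative):
# parser = Parser(account_records=arriving_account_records,
#                 task_requirements={"1": BuildingCluster(...)})
# parsed_records = parser.parse()  # list[ParsedComment], one entry per account record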
def commented_code():
"""Legacy reference implementation, kept for comparison; not called anywhere."""
def main(account_records, people):
list_of_regex_patterns = generate_dictonary_of_patterns(people=people)
dicts_found, dicts_not_found, count_extracted = dict(), dict(), 0
for account_record in account_records:
account_record_id = str(account_record["id"])
found_dict = {}
process_comment_iteration = clean_text(text=account_record["process_comment"])
found_dict, cleaned_process_comment = extract_person_name_with_regex(found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=list_of_regex_patterns, people=people)
found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, process_comment=cleaned_process_comment)
found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment)
found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment)
found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment)
if found_dict:
dicts_found[str(account_record_id)] = found_dict
else:
dicts_not_found[str(account_record_id)] = account_record_id
for id_, item in dicts_found.items():
months_are_valid = bool(item.get("months", []))
years_are_valid = bool(item.get("years", []))
payment_types_are_valid = bool(item.get("payment_types", []))
apartment_number_are_valid = bool(item.get("apartment_number", []))
person_name_are_valid = bool(item.get("name_match", []))
account_record_to_save = AccountRecords.query.filter_by(id=int(id_)).first()
save_dict = dict(account_records_id=account_record_to_save.id, account_records_uu_id=str(account_record_to_save.uu_id), prediction_model="regex", treshold=1, is_first_prediction=False)
update_dict = dict(prediction_model="regex", treshold=1, is_first_prediction=False)
if any([months_are_valid, years_are_valid, payment_types_are_valid, apartment_number_are_valid, person_name_are_valid]):
count_extracted += 1
if months_are_valid:
print(f"months: {item['months']}")
data_to_save = dumps({"data": item['months']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="months", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="months", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
if years_are_valid:
print(f"years: {item['years']}")
data_to_save = dumps({"data": item['years']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="years", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="years", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
if payment_types_are_valid:
print(f"payment_types: {item['payment_types']}")
data_to_save = dumps({"data": item['payment_types']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="payment_types", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="payment_types", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
if apartment_number_are_valid:
print(f"apartment_number: {item['apartment_number']}")
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="apartment_number", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="apartment_number", prediction_result=item['apartment_number'])
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=item['apartment_number'])
prediction_result.save()
if person_name_are_valid:
print(f"person_name: {item['name_match']}")
data_to_save = dumps({"data": item['name_match']})
prediction_result = AccountRecordsPredict.query.filter_by(account_records_id=account_record_to_save.id, prediction_field="person_name", prediction_model="regex").first()
if not prediction_result:
created_account_prediction = AccountRecordsPredict.create(**save_dict, prediction_field="person_name", prediction_result=data_to_save)
created_account_prediction.save()
else:
prediction_result.update(**update_dict, prediction_result=data_to_save)
prediction_result.save()
print("\n===== SUMMARY =====")
print(f"extracted data total : {count_extracted}")
print(f"not extracted data total : {len(account_records) - count_extracted}")
print(f"Total account records processed : {len(account_records)}")
# if __name__ == "__main__":
# people_query = sqlalchemy_text("""
# SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
# FROM public.people as p
# INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
# INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
# INNER JOIN public.build as b ON b.id = bp.build_id
# WHERE b.id = 1
# ORDER BY p.id
# """)
# people_raw = session.execute(people_query).all()
# remove_duplicate = list()
# clean_people_list = list()
# for person in people_raw:
# merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
# if merged_name not in remove_duplicate:
# clean_people_list.append(person)
# remove_duplicate.append(merged_name)
# people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]
# query_account_records = sqlalchemy_text("""
# SELECT a.id, a.iban, a.bank_date, a.process_comment FROM public.account_records as a where currency_value > 0
# """) # and bank_date::date >= '2020-01-01'
# account_records = session.execute(query_account_records).all()
# account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]
# try:
# main(session=session, account_records=account_records, people=people)
# except Exception as e:
# print(f"{e}")
# session.close()
# session_factory.remove()

View File

@ -0,0 +1,93 @@
from typing import Optional, List
from pydantic import BaseModel

class User(BaseModel):
    id: int
    uu_id: str
    user_tag: str
    user_type: str
    email: str
    phone_number: str
    related_company: str
    is_confirmed: bool
    active: bool


class Person(BaseModel):
    id: int
    uu_id: str
    firstname: str
    surname: str
    middle_name: Optional[str] = ""
    birthname: Optional[str] = ""
    # national_identity_id: str
    is_confirmed: bool
    active: bool
    user: Optional[User] = None


class OccupantType(BaseModel):
    id: int
    uu_id: str
    occupant_code: str
    occupant_type: str
    is_confirmed: bool
    active: bool
    user_type_uu_id: Optional[str] = None


class BuildPart(BaseModel):
    id: int
    uu_id: str
    part_no: str
    part_level: str
    part_code: str
    part_gross_size: float
    part_net_size: float
    human_livable: bool
    build_id: int
    build_uu_id: str
    is_confirmed: bool
    active: bool
    living_spaces: Optional[List['BuildLivingSpace']] = None


class BuildLivingSpace(BaseModel):
    id: int
    uu_id: str
    expiry_starts: str
    expiry_ends: str
    fix_value: float
    fix_percent: float
    agreement_no: str
    marketing_process: bool
    build_parts_id: int
    build_parts_uu_id: str
    person_id: int
    person_uu_id: str
    occupant_type_id: int
    occupant_type_uu_id: str
    is_confirmed: bool
    active: bool
    person: Optional[Person] = None
    occupant_type: Optional[OccupantType] = None


class BuildingCluster(BaseModel):
    id: int
    uu_id: str
    build_name: str
    build_no: str
    build_date: str
    decision_period_date: str
    expiry_starts: str
    expiry_ends: str
    is_confirmed: bool
    active: bool
    build_parts: List['BuildPart'] = []


# Update forward references for models with circular dependencies
BuildPart.update_forward_refs()
BuildingCluster.update_forward_refs()
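
# Minimal construction sketch; the values below are made up for illustration only.
# Once update_forward_refs() has run, nested BuildPart instances parse inside BuildingCluster.
example_cluster = BuildingCluster(
    id=1, uu_id="b-0001", build_name="Demo Build", build_no="A1",
    build_date="2020-01-01", decision_period_date="2020-06-01",
    expiry_starts="2020-01-01", expiry_ends="2030-01-01",
    is_confirmed=True, active=True,
    build_parts=[BuildPart(
        id=10, uu_id="bp-0010", part_no="3", part_level="1", part_code="D3",
        part_gross_size=120.0, part_net_size=98.5, human_livable=True,
        build_id=1, build_uu_id="b-0001", is_confirmed=True, active=True,
    )],
)
assert example_cluster.build_parts[0].part_code == "D3"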

View File

@ -138,6 +138,7 @@ class ConfigServices:
    TASK_UUID_INDEX_PREFIX: str = "BANK:SERVICES:TASK:UUID"
    TASK_SEEN_PREFIX: str = "BANK:SERVICES:TASK:SEEN"
    TASK_DELETED_PREFIX: str = "BANK:SERVICES:TASK:DELETED"
    TASK_COMMENT_PARSER: str = "BANK:SERVICES:TASK:COMMENT:PARSER"

    SERVICE_PREFIX_MAIL_READER: str = "MailReader"
    SERVICE_PREFIX_MAIL_PARSER: str = "MailParser"
@ -145,6 +146,7 @@ class ConfigServices:
    SERVICE_PREFIX_FINDER_COMMENT: str = "FinderComment"
    SERVICE_PREFIX_MAIL_SENDER: str = "MailSender"
    TEMPLATE_ACCOUNT_RECORDS: str = "template_accounts.html"

View File

@ -132,7 +132,7 @@ class PrismaService:
        table_selected: BaseModelClient = getattr(db, table, None)
        if not table_selected:
            raise ValueError(f"Table {table} not found")
-       rows = await table_selected.find_many(where=query, take=take, skip=skip, order=order or [], select=select, include=include)
+       rows = await table_selected.find_many(where=query, take=take, skip=skip, order=order or [], include=include)
        # print(f"[{datetime.now()}] Find many query completed in {time.time() - start:.2f}s")
        return rows
@ -234,7 +234,7 @@ class PrismaService:
        self, table: str, query: Optional[dict] = None, take: int = None, skip: int = None,
        order: Optional[list[dict]] = None, select: Optional[dict] = None, include: Optional[dict] = None
    ):
-       result = self._submit(self._a_find_many(table=table, query=query, take=take, skip=skip, order=order, select=select, include=include))
+       result = self._submit(self._a_find_many(table=table, query=query, take=take, skip=skip, order=order, include=include))
        if select and result:
            result = [{k: v for k, v in item.items() if k in select} for item in result]
        return result
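
# select is applied after the fetch: rows come back whole from Prisma and are
# trimmed in Python. Standalone sketch of that post-filtering (rows are made up):
rows = [
    {"id": 1, "iban": "TR00...", "process_comment": "JAN DUES"},
    {"id": 2, "iban": "TR01...", "process_comment": "FEB DUES"},
]
select = {"id": True, "process_comment": True}
trimmed = [{k: v for k, v in item.items() if k in select} for item in rows]
# trimmed == [{'id': 1, 'process_comment': 'JAN DUES'}, {'id': 2, 'process_comment': 'FEB DUES'}]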

View File

@ -605,15 +605,6 @@ class MailReaderService:
        self.service_retriever = ServiceTaskRetriever(self.redis_handler)
        self._initialized = True
    def ensure_connection(self):
        """
        Ensure Redis connection is established
        Returns:
            bool: True if connection is established, False otherwise
        """
        return self.redis_handler.ensure_connection()
    def get_task_by_uuid(self, task_uuid: str) -> RedisTaskObject:
        """
        Get a task object by its UUID
@ -840,34 +831,6 @@ class MailParserService:
    def fetch_all_tasks(self) -> list[RedisTaskObject]:
        return self.service_retriever.fetch_all_tasks()
    def ensure_connection(self):
        """
        Ensure Redis connection is established
        Returns:
            bool: True if connection is established, False otherwise
        """
        return self.redis_handler.ensure_connection()

    def _check_redis_connection(self) -> bool:
        """
        Check if Redis connection is alive using RedisHandler
        Returns:
            True if connection is alive, False otherwise
        """
        try:
            # Use RedisHandler to check connection
            connection_status = self.redis_handler.ensure_connection()
            if connection_status:
                logger.info("Redis connection established via RedisHandler")
            else:
                logger.error("Redis connection check failed via RedisHandler")
            return connection_status
        except RedisHandler.REDIS_EXCEPTIONS as e:
            logger.error(f"Redis connection failed: {str(e)}")
            return False
    def get_task_by_uuid(self, task_uuid: str) -> RedisTaskObject:
        """
        Get a task object by its UUID
@ -948,34 +911,6 @@ class IbanFinderService:
    def fetch_all_tasks(self) -> list[RedisTaskObject]:
        return self.service_retriever.fetch_all_tasks()
    def ensure_connection(self):
        """
        Ensure Redis connection is established
        Returns:
            bool: True if connection is established, False otherwise
        """
        return self.redis_handler.ensure_connection()

    def _check_redis_connection(self) -> bool:
        """
        Check if Redis connection is alive using RedisHandler
        Returns:
            True if connection is alive, False otherwise
        """
        try:
            # Use RedisHandler to check connection
            connection_status = self.redis_handler.ensure_connection()
            if connection_status:
                logger.info("Redis connection established via RedisHandler")
            else:
                logger.error("Redis connection check failed via RedisHandler")
            return connection_status
        except RedisHandler.REDIS_EXCEPTIONS as e:
            logger.error(f"Redis connection failed: {str(e)}")
            return False
    def get_task_by_uuid(self, task_uuid: str) -> RedisTaskObject:
        """
        Get a task object by its UUID
@ -1152,3 +1087,41 @@ class ProcessCommentFinderService:
        return self.service_retriever.delete_task(task_uuid, max_retries)
class ProcessCommentParserService:
    """
    Class for processing comment parser tasks
    """
    instance = None
    REDIS_EXCEPTIONS = RedisHandler.REDIS_EXCEPTIONS

    def __init__(self):
        if hasattr(self, '_initialized') and self._initialized:
            return
        self.service_retriever: ServiceTaskRetriever = ServiceTaskRetriever()
        self._initialized = True

    def fetch_all_tasks(self) -> list[RedisTaskObject]:
        """
        Get all tasks from Redis
        Returns:
            list: List of task objects
        """
        return self.service_retriever.fetch_all_tasks_parser()

    def get_task_requirements(self) -> dict:
        """
        Get task requirements from Redis
        Returns:
            dict: Task requirements if found
        """
        if task_object := self.service_retriever.redis_handler.get(ConfigServices.TASK_COMMENT_PARSER):
            return loads(task_object)
        return None

    def set_task_requirements(self, task_object: RedisTaskObject):
        """
        Set task requirements in Redis
        """
        return self.service_retriever.redis_handler.set(ConfigServices.TASK_COMMENT_PARSER, dumps(task_object))
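
# Usage sketch, assuming the payload is a plain JSON-serializable dict
# (json.dumps would reject a richer RedisTaskObject without a custom encoder):
parser_service = ProcessCommentParserService()
parser_service.set_task_requirements({"build_id": 1, "fields": ["months", "years"]})
requirements = parser_service.get_task_requirements()
if requirements:
    print(requirements["fields"])  # ['months', 'years']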

View File

@ -120,6 +120,22 @@ services:
      options:
        max-size: "10m"
        max-file: "3"
  process_comment_parser:
    container_name: process_comment_parser
    build:
      context: .
      dockerfile: ServicesRunner/AccountRecordServices/Finder/Parser/Comment/Dockerfile
    networks:
      - bank-services-network
    volumes:
      - ./ServicesRunner/AccountRecordServices/Finder/Parser/Comment/venv:/opt/venv
      - ./ServicesRunner/AccountRecordServices/Finder/Parser/Comment/.prisma-cache:/root/.cache/prisma-python
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
  # finder_payments:
  #   container_name: finder_payments
  #   env_file:
@ -135,8 +151,6 @@ services:
  #     max-size: "10m"
  #     max-file: "3"
networks:
  bank-services-network:
    driver: bridge