updated table frontend
commit 917396ec15
@@ -0,0 +1,2 @@
.venv
__pycache__
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,61 @@
from contextlib import contextmanager
from functools import lru_cache
from typing import Generator

from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker, scoped_session, Session

# Configure the database engine with connection pooling
engine = create_engine(
    "postgresql+psycopg2://postgres:password@10.10.2.14:5432/postgres",
    pool_pre_ping=True,   # Verify connections before handing them out
    pool_size=10,         # Reduced from 20 to better match the available CPU cores
    max_overflow=5,       # Reduced from 10 to cap the total connection count
    pool_recycle=600,     # Recycle connections after 10 minutes
    pool_timeout=30,      # Wait up to 30 seconds for a free connection
    echo=False,           # Keep False in production to avoid noisy SQL logging
)

Base = declarative_base()


# Create a cached, thread-safe session factory
@lru_cache()
def get_session_factory() -> scoped_session:
    """Create a thread-safe session factory."""
    session_local = sessionmaker(
        bind=engine,
        autocommit=False,
        autoflush=False,
        expire_on_commit=True,  # Expire objects after commit so they reload fresh state
    )
    return scoped_session(session_local)


@contextmanager
def get_db() -> Generator[Session, None, None]:
    """Get a database session with proper connection management.

    This context manager ensures:
    - proper connection pooling
    - session cleanup
    - connections are returned to the pool
    - thread safety

    Yields:
        Session: SQLAlchemy session object
    """
    session_factory = get_session_factory()
    session = session_factory()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
        session_factory.remove()  # Remove the session from the scoped registry
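

# A minimal usage sketch of get_db; assumes this module is importable as
# base_import (as the scripts below do) and that public.people exists.
if __name__ == "__main__":
    from sqlalchemy import text

    with get_db() as session:
        # Illustrative query only; swap in any table that exists.
        people_count = session.execute(text("SELECT COUNT(*) FROM public.people")).scalar()
        print(f"people rows: {people_count}")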
@@ -0,0 +1,225 @@
import re
import pandas as pd
from unidecode import unidecode
from difflib import SequenceMatcher
from itertools import permutations
from time import perf_counter
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text

session_factory = get_session_factory()
session = session_factory()

turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
turkish_months_abbr = {
    "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
    "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
start_year = 1950
current_year = pd.Timestamp.now().year

# People living in build id 1, joined through their living spaces
people = session.execute(
    sqlalchemy_text("""
        SELECT p.firstname, p.middle_name, p.surname, bl.id
        FROM public.people AS p
        INNER JOIN public.build_living_space AS bl ON bl.person_id = p.id
        INNER JOIN public.build_parts AS bp ON bp.id = bl.build_parts_id
        INNER JOIN public.build AS b ON b.id = bp.build_id
        WHERE b.id = 1
    """)
).all()
people = [{"firstname": p[0], "middle_name": p[1], "last_name": p[2], 'id': p[3]} for p in people]


def clean_text(text):
    # Convert to string just in case
    text = str(text)
    # Collapse extra spaces and tabs by splitting and joining
    text = " ".join(text.split())
    # Replace separator characters (hyphen, asterisk, comma) with spaces
    text = text.replace("-", " ").replace("*", " ").replace(",", " ")
    return text


def is_valid_char_match(word, month):
    """Ensure all letters in the word exist in the target month name."""
    return set(word).issubset(set(month))


def best_month_matches(text, threshold=0.7):
    matches = []
    words = clean_text(text).split()

    for word in words:
        # First check the abbreviation dictionary for an exact match
        if word in turkish_months_abbr:
            full_month = turkish_months_abbr[word]
            matches.append((full_month.capitalize(), word, 1.0))
            continue

        # Otherwise fuzzy-match against the full month names
        for month in turkish_months:
            month_clean = unidecode(month.upper())
            ratio = SequenceMatcher(None, word, month_clean).ratio()
            if ratio >= threshold and is_valid_char_match(word, month_clean):
                matches.append((month.capitalize(), word, round(ratio, 2)))
                break

    return matches if matches else ["N/A"]
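

# Quick illustration of the fuzzy matcher on a made-up comment ("TEMMUZ" is
# an exact full-month hit, "EYL" an abbreviation hit):
# best_month_matches("TEMMUZ AIDAT EYL")
# -> [('Temmuz', 'TEMMUZ', 1.0), ('Eylül', 'EYL', 1.0)]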


def print_out_results(data_frame, df):
    for ix, process_comment in enumerate(data_frame):
        print(f'Row number {ix} : {process_comment}')
    print(len(data_frame), '/', len(df))


def extract_years(text):
    matches = re.findall(r'\b(19\d{2}|20\d{2})\b', text)
    valid_years = [year for year in matches if start_year <= int(year) <= current_year]
    return valid_years if valid_years else ["N/A"]


def extract_months(text):
    # Normalize text and months by converting to ASCII lowercase
    text_norm = unidecode(str(text)).lower()
    months_norm = [unidecode(m).lower() for m in turkish_months]
    found = [turkish_months[i] for i, m in enumerate(months_norm) if m in text_norm]
    return found if found else ["N/A"]


def normalize_text(text):
    return unidecode(text).lower()


def build_name_regex_all_combinations(person):
    # Guard with `or ""` because middle_name can come back as NULL/None
    firstname = (person.get("firstname") or "").strip()
    middle_name = (person.get("middle_name") or "").strip()
    last_name = (person.get("last_name") or "").strip()

    parts = [unidecode(p).lower() for p in [firstname, middle_name, last_name] if p]

    patterns = []
    for r in range(1, len(parts) + 1):
        # Use permutations instead of combinations to cover all orderings
        for permuted_parts in permutations(parts, r):
            regex_pattern = r"\b" + r"\s*".join(map(re.escape, permuted_parts)) + r"\b"
            patterns.append((re.compile(regex_pattern, flags=re.IGNORECASE), r))

    # Add a fully merged version such as "fatihergunguclu"
    if len(parts) >= 2:
        merged = ''.join(parts)
        patterns.append((re.compile(rf"\b{re.escape(merged)}\b", flags=re.IGNORECASE), len(parts)))

    return patterns


def get_person_initials(person):
    parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("last_name", "")]
    return [unidecode(p.strip())[0].upper() for p in parts if p]


def get_text_initials(matched_text):
    return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]


def find_person_regex_all_combinations(text, person):
    text_norm = normalize_text(text)
    scored_matches = []

    for pattern, weight in build_name_regex_all_combinations(person):
        for match in pattern.finditer(text_norm):
            scored_matches.append({
                "matched_person": person,
                "matched_text": match.group().strip(),
                "weight": weight,
            })

    return scored_matches


def find_all_person_matches(text):
    all_valid_matches = {}

    for person in people:
        for match in find_person_regex_all_combinations(text, person):
            matched_person = match["matched_person"]
            matched_text = match["matched_text"]
            weight = match["weight"]

            person_initials = get_person_initials(matched_person)
            found_text_letters = get_text_initials(matched_text)

            # Require at least two initials of the matched text to belong to the person
            match_count = sum(1 for c in found_text_letters if c in person_initials)

            if match_count >= 2:
                person_key = tuple(matched_person.values())
                if person_key not in all_valid_matches or all_valid_matches[person_key][0] < weight:
                    all_valid_matches[person_key] = (weight, matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]
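

# Illustrative sketch of the matcher on a hypothetical person record (the
# name and id below are invented, not taken from the database):
# _p = {"firstname": "Hasan", "middle_name": "Cihan", "last_name": "Kaya", "id": 0}
# find_person_regex_all_combinations("AIDAT HASAN CIHAN KAYA", _p)
# -> one match per permutation found in the text; the full three-part
#    permutation carries weight 3 and wins in find_all_person_matches.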


def find_all_person_matches_possibilities(text):
    # Looser variant: keep every regex hit without the initials check
    all_valid_matches = {}

    for person in people:
        for match in find_person_regex_all_combinations(text, person):
            matched_person = match["matched_person"]
            weight = match["weight"]
            person_key = tuple(matched_person.values())
            all_valid_matches[person_key] = (weight, matched_person)

    sorted_matches = sorted(all_valid_matches.values(), key=lambda x: -x[0])
    return [m[1] for m in sorted_matches] or ["N/A"]


start_time = perf_counter()
df = pd.read_csv("account_records_incoming.csv")

df["process_comment"] = df["process_comment"].apply(clean_text)
df["year"] = df["process_comment"].str.extract(r'(\d{4})')
df["year"] = pd.to_numeric(df["year"], errors='coerce')

df["year"] = df["year"].where(df["year"].between(start_year, current_year))
df["years_found"] = df["process_comment"].apply(extract_years)
df["months_found"] = df["process_comment"].apply(extract_months)
df["fuzzy_months_found"] = df["process_comment"].apply(best_month_matches)
df["person_name_matches"] = df["process_comment"].apply(find_all_person_matches)
df["person_name_matches_possibilities"] = df["process_comment"].apply(find_all_person_matches_possibilities)

# Debugging helper, kept for reference:
# for ix, row in df[["process_comment", "years_found", "months_found", "fuzzy_months_found", "person_name_matches", "person_name_matches_possibilities"]].iterrows():
#     if row['person_name_matches'] == ["N/A"] and row['person_name_matches_possibilities'] != ["N/A"]:
#         print(f"Row number {ix} | {row['process_comment']}")
#         print(f"Person name matches: {row['person_name_matches']} | Person name matches possibilities: {row['person_name_matches_possibilities']}")

month_direct_found = df[df['months_found'].apply(lambda x: x != ["N/A"])]
month_indirect_found = df[df['months_found'].apply(lambda x: x == ["N/A"]) | df['fuzzy_months_found'].apply(lambda x: x != ["N/A"])]
years_found = df[df['years_found'].apply(lambda x: x != ["N/A"])]
people_found = df[df['person_name_matches'].apply(lambda x: x != ["N/A"])]
people_found_possibilities = df[df['person_name_matches_possibilities'].apply(lambda x: x != ["N/A"]) & df['person_name_matches'].apply(lambda x: x == ["N/A"])]

print("Rows with years found (not N/A):", len(years_found), '/', len(df))
print("Rows with months found directly (not N/A):", len(month_direct_found), '/', len(df))
print("Rows with months found via fuzzy match (not N/A):", len(month_indirect_found), '/', len(df))
print("Rows with names found:", len(people_found), '/', len(df))
print("Rows with names found only as possibilities:", len(people_found_possibilities), '/', len(df))
end_time = perf_counter()
print(f"Total processing time: {end_time - start_time:.2f} seconds")
@@ -0,0 +1,266 @@
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text

session_factory = get_session_factory()
session = session_factory()


# LIMIT and OFFSET are added directly to the SQL query
def query_to_run(limit: int, offset: int):
    return sqlalchemy_text(f"""
        SELECT a.id, a.uu_id, a.iban, a.bank_date, a.process_comment, a.payment_result_type,
               b.part_code, COUNT(a2.id) AS sum_a2_id
        FROM public.account_records AS a
        INNER JOIN public.build_parts b ON b.id = a.build_parts_id
        INNER JOIN public.api_enum_dropdown ae ON ae.id = a.payment_result_type
        INNER JOIN public.build_living_space bl ON bl.id = a.living_space_id
        INNER JOIN public.people p ON p.id = bl.person_id
        LEFT JOIN public.account_records_model_train a2 ON a2.account_records_id = a.id
        WHERE a.bank_date::date > '2023-06-30' AND a2.id IS NULL
        GROUP BY a.id, a.uu_id, a.iban, a.bank_date, a.process_comment, a.payment_result_type, b.part_code
        ORDER BY a.bank_date ASC
        LIMIT {int(limit)} OFFSET {int(offset)};
    """)


"""
Reference enum rows used by Categories below:

52 56b75aec-d28f-4cd4-84e9-ea222cc1d9bd BuildTypes     APT_KZN Apartman Kazan Dairesi
53 a9f854d1-d01d-4f2a-af5f-1ccf34193e0f BuildTypes     APT_GRJ Apartman Garaj
54 ed7371a4-0a0a-491d-b1f9-015025b6ac91 BuildTypes     APT_DP  Apartman Depo
55 f6eb95dd-5ed0-407b-8205-4bc855199b06 BuildTypes     DAIRE   Apartman Dairesi
56 242bbe5e-44df-4f10-9583-9d80ff93c52d BuildTypes     APT     Apartman Binası
57 8920eb8b-a5aa-42c3-81d0-13afca85ba1f BuildTypes     APT_YNT Apartman Yönetimi
58 a7b98daf-c83e-494d-8938-d716be131b5d BuildTypes     APT_PRK Apartman Açık Park Alanı
59 628188d9-b5e3-493e-9a42-afac3f5bf816 BuildTypes     APT_YSL Apartman Yeşil Alan
60 8b5bcca2-7702-4486-904c-d708248ccd4d BuildTypes     APT_YOL Apartman Ara Yol
 5 1b51381a-b5a9-485e-884e-fab07b4adf21 BuildDuesTypes BDT-S   Service fee
 6 4619b29f-7b60-4b95-9a97-50a4e5d40f94 BuildDuesTypes BDT-I   Information
 1 2d0127eb-899e-47c5-ad86-67a78174bf90 BuildDuesTypes BDT-D   Bina Aidat
 2 11656423-24b7-4ed9-96e7-1563f639da53 BuildDuesTypes BDT-A   Bina Ek Aidat
 3 c74c72f4-5e10-4d00-8016-4f9ddd50b3c4 BuildDuesTypes BDT-R   Bina Tadilat
 4 5edeb654-b7ce-4c1f-b7e3-2c717bb1d263 BuildDuesTypes BDT-L   Bina Yasal Harcama
48 f14ae805-8238-438f-a522-d8ac6553f717 TimePeriod     TP-W    Weekly
49 184c3356-6397-476d-a965-45ddf26a4ff5 TimePeriod     TP-M    Monthly
50 ba36110f-7afe-4c41-bcad-f80ce71f626e TimePeriod     TP-Q    Quarterly
51 89ff94c6-126e-45c2-9bc7-6d1007d02528 TimePeriod     TP-Y    Yearly
32 3cf533a4-3947-4563-9a43-16ea2bab1119 PerComType     1       Person
"""


class Categories:
    APTKZN = "APTKZN"
    APTGRJ = "APTGRJ"
    APTDP = "APTDP"
    DAIRE = "DAIRE"
    APT = "APT"
    APTYNT = "APTYNT"
    APTPRK = "APTPRK"
    APTYSL = "APTYSL"
    APTYOL = "APTYOL"
    BDTI = "BDTI"
    BDTD = "BDTD"
    BDTA = "BDTA"
    BDTR = "BDTR"
    BDTL = "BDTL"
    TPW = "TPW"
    TPM = "TPM"
    TPQ = "TPQ"
    TPY = "TPY"
    PERSON = "PERSON"

    @classmethod
    def get_category_id(cls, category_name):
        category_dict = {
            "APTKZN": (52, "56b75aec-d28f-4cd4-84e9-ea222cc1d9bd"),
            "APTGRJ": (53, "a9f854d1-d01d-4f2a-af5f-1ccf34193e0f"),
            "APTDP": (54, "ed7371a4-0a0a-491d-b1f9-015025b6ac91"),
            "DAIRE": (55, "f6eb95dd-5ed0-407b-8205-4bc855199b06"),
            "APT": (56, "242bbe5e-44df-4f10-9583-9d80ff93c52d"),
            "APTYNT": (57, "8920eb8b-a5aa-42c3-81d0-13afca85ba1f"),
            "APTPRK": (58, "a7b98daf-c83e-494d-8938-d716be131b5d"),
            "APTYSL": (59, "628188d9-b5e3-493e-9a42-afac3f5bf816"),
            "APTYOL": (60, "8b5bcca2-7702-4486-904c-d708248ccd4d"),
            "BDTI": (5, "1b51381a-b5a9-485e-884e-fab07b4adf21"),  # NB: row 5 is BDT-S in the reference list above
            "BDTD": (1, "2d0127eb-899e-47c5-ad86-67a78174bf90"),
            "BDTA": (2, "11656423-24b7-4ed9-96e7-1563f639da53"),
            "BDTR": (3, "c74c72f4-5e10-4d00-8016-4f9ddd50b3c4"),
            "BDTL": (4, "5edeb654-b7ce-4c1f-b7e3-2c717bb1d263"),
            "TPW": (48, "f14ae805-8238-438f-a522-d8ac6553f717"),
            "TPM": (49, "184c3356-6397-476d-a965-45ddf26a4ff5"),
            "TPQ": (50, "ba36110f-7afe-4c41-bcad-f80ce71f626e"),
            "TPY": (51, "89ff94c6-126e-45c2-9bc7-6d1007d02528"),
            "PERSON": (32, "3cf533a4-3947-4563-9a43-16ea2bab1119"),
        }
        if category_name not in category_dict:
            raise ValueError(f"Invalid category name: {category_name}")
        return category_dict[category_name]
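

# Example lookup (values come straight from category_dict above):
# Categories.get_category_id(Categories.DAIRE)
# -> (55, "f6eb95dd-5ed0-407b-8205-4bc855199b06")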


def get_model_train_query():
    """
    Returns a SQLAlchemy text object for inserting model train data.
    Parameters are bound when the query is executed.
    """
    return sqlalchemy_text("""
        INSERT INTO public.account_records_model_train (
            account_records_id,
            account_records_uu_id,
            search_text,
            start_index,
            end_index,
            category_id,
            category_uu_id
        )
        VALUES (
            :account_records_id,
            :account_records_uu_id,
            :search_text,
            :start_index,
            :end_index,
            :category_id,
            :category_uu_id
        );
    """)


def find_indices(search_text: str, target_text: str, offset: int = 1):
    """
    Returns the start and end indices of search_text within target_text.

    Args:
        search_text (str): The text to search for
        target_text (str): The text to search within
        offset (int, optional): Which occurrence to find (1 for first, 2 for second, etc.). Defaults to 1.

    Returns:
        tuple: (start_index, end_index) of the found text, or (-1, -1) if not
        found or the requested occurrence does not exist
    """
    if offset < 1:
        offset = 1

    current_pos = 0
    occurrence_count = 0

    while True:
        start_index = target_text.find(search_text, current_pos)

        if start_index == -1:  # Not found (also covers a too-large offset)
            return -1, -1

        occurrence_count += 1

        if occurrence_count == offset:
            # End index is exclusive, Python-slice style
            return start_index, start_index + len(search_text)

        # Move past this occurrence to find the next one
        current_pos = start_index + 1


class AccountRecord:

    def __init__(self, id, uu_id, iban, bank_date, process_comment, payment_result_type, part_code, sum_a2_id):
        self.id = id
        self.uu_id = str(uu_id)
        self.iban = iban
        self.bank_date = bank_date
        self.process_comment = process_comment
        self.payment_result_type = payment_result_type
        self.part_code = part_code
        self.sum_a2_id = sum_a2_id

    def to_dict(self):
        return {
            "id": self.id,
            "uu_id": self.uu_id,
            "iban": self.iban,
            "bank_date": self.bank_date,
            "process_comment": self.process_comment,
            "payment_result_type": self.payment_result_type,
            "part_code": self.part_code,
            "sum_a2_id": self.sum_a2_id,
        }


# Execute the query and process the results directly
results = session.execute(query_to_run(limit=1, offset=0))

account_record_dict = dict()
for result in results:
    # Example row shape:
    # (791, UUID('9d276cc8-289f-45c1-9805-44464af5d7bf'), 'TR400006400000142450093333',
    #  datetime.datetime(2023, 7, 1, 12, 22, 27, tzinfo=datetime.timezone.utc),
    #  '2 NOLU DAİRE TEMMUZ Ç3 AİDAT*SONGÜL VAR*Hİ7748686973', 1, 'DAIRE_2', 4)
    account_record = AccountRecord(*result)
    account_record_dict = account_record.to_dict()

print(account_record_dict['id'])
print(account_record_dict['uu_id'])
print(account_record_dict['process_comment'])


account_records_id = 219
account_records_uuid = "5d301273-806c-47d6-aeeb-e056dc119494"
sample_text = "GÜNEŞ APARTMANI AİDAT EYLÜL*HASAN CİHAN ŞENKÜÇÜK*Hİ9021822604"
search_text = "HASAN CİHAN ŞENKÜÇÜK"
start_index, end_index = find_indices(search_text, sample_text, offset=1)
print("start_index", start_index)
print("end_index", end_index)
category_id, category_uuid = Categories.get_category_id(Categories.PERSON)

# Prepare the parameters for the query
params = {
    "account_records_id": account_records_id,
    "account_records_uu_id": account_records_uuid,
    "search_text": search_text,
    "start_index": start_index,
    "end_index": end_index,
    "category_id": category_id,
    "category_uu_id": category_uuid,
}

# Get the parameterized query template
query_template = get_model_train_query()

# Print parameters for debugging
print("Parameters:", params)

# Execute the query with bound parameters
session.execute(query_template, params)
session.commit()
@@ -0,0 +1 @@
Subproject commit 0695581ea98094b470369add8a170469cc6102ad
@@ -0,0 +1,5 @@
sqlalchemy-mixins>=2.0.5
psycopg2-binary>=2.9.10
arrow>=1.3.0
pandas>=2.2.2
numpy>=1.26.4
unidecode
@@ -0,0 +1,9 @@
from base_import import get_session_factory, engine
from sqlalchemy import text as sqlalchemy_text
import pandas as pd

# session_factory = get_session_factory()
# session = session_factory()

query = sqlalchemy_text("SELECT * FROM public.account_records where currency_value > 0")
pd.read_sql(query, engine).to_csv("account_records_incoming.csv", index=False)
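
# For large tables, a chunked export keeps memory flat (sketch using the
# same query/engine as above; the chunk size is an arbitrary choice):
# first = True
# for chunk in pd.read_sql(query, engine, chunksize=50_000):
#     chunk.to_csv("account_records_incoming.csv", mode="w" if first else "a", header=first, index=False)
#     first = False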
@@ -0,0 +1,553 @@
import re
import arrow
from unidecode import unidecode
from base_import import get_session_factory
from sqlalchemy import text as sqlalchemy_text

session_factory = get_session_factory()
session = session_factory()

turkish_months = ["OCAK", "ŞUBAT", "MART", "NİSAN", "MAYIS", "HAZİRAN", "TEMMUZ", "AĞUSTOS", "EYLÜL", "EKİM", "KASIM", "ARALIK"]
turkish_months_abbr = {
    "OCA": "OCAK", "SUB": "ŞUBAT", "ŞUB": "ŞUBAT", "MAR": "MART", "NIS": "NİSAN", "MAY": "MAYIS", "HAZ": "HAZİRAN", "HZR": "HAZİRAN",
    "TEM": "TEMMUZ", "AGU": "AĞUSTOS", "AGT": "AĞUSTOS", "EYL": "EYLÜL", "EKI": "EKİM", "KAS": "KASIM", "ARA": "ARALIK",
}
start_year = 1950
current_year = arrow.now().year

people_query = sqlalchemy_text("""
    SELECT DISTINCT ON (p.id) p.firstname, p.middle_name, p.surname, p.birthname, bl.id
    FROM public.people as p
    INNER JOIN public.build_living_space as bl ON bl.person_id = p.id
    INNER JOIN public.build_parts as bp ON bp.id = bl.build_parts_id
    INNER JOIN public.build as b ON b.id = bp.build_id
    WHERE b.id = 1
    ORDER BY p.id
""")
people_raw = session.execute(people_query).all()

# Drop rows whose full merged name repeats (DISTINCT ON keeps one row per
# person id, but the same name can still appear under several ids)
remove_duplicate = list()
clean_people_list = list()
for person in people_raw:
    merged_name = f"{person[0]} {person[1]} {person[2]} {person[3]}"
    if merged_name not in remove_duplicate:
        clean_people_list.append(person)
        remove_duplicate.append(merged_name)

people = [{"firstname": p[0], "middle_name": p[1], "surname": p[2], "birthname": p[3], 'id': p[4]} for p in clean_people_list]

query_account_records = sqlalchemy_text("""
    SELECT a.id, a.iban, a.bank_date, a.process_comment
    FROM public.account_records as a
    WHERE currency_value > 0
""")  # optionally: AND bank_date::date >= '2020-01-01'
account_records = session.execute(query_account_records).all()
account_records = [{"id": ar[0], "iban": ar[1], "bank_date": ar[2], "process_comment": ar[3]} for ar in account_records]


def clean_text(text):
    text = str(text)
    # Drop long digit runs (IBAN fragments, reference numbers, etc.)
    text = re.sub(r'\d{8,}', ' ', text)
    # Replace separator characters with spaces
    for ch in "/_-+*,.":
        text = text.replace(ch, " ")
    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def normalize_text(text):
    # Map Turkish-specific uppercase letters before unidecode/lower, because
    # str.lower() is not locale-aware for 'İ' and 'I'
    text = text.replace('İ', 'i')
    text = text.replace('I', 'ı')
    text = text.replace('Ş', 'ş')
    text = text.replace('Ğ', 'ğ')
    text = text.replace('Ü', 'ü')
    text = text.replace('Ö', 'ö')
    text = text.replace('Ç', 'ç')
    return unidecode(text).lower()
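

# Why the manual mapping matters (illustrative values):
# "IŞIK".lower() mishandles the Turkish dotless I, while
# normalize_text("IŞIK") -> "isik", matching normalize_text("ışık").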


def get_person_initials(person):
    parts = [person.get("firstname", ""), person.get("middle_name", ""), person.get("surname", ""), person.get("birthname", "")]
    return [unidecode(p.strip())[0].upper() for p in parts if p]


def get_text_initials(matched_text):
    return [unidecode(word.strip())[0].upper() for word in matched_text.split() if word.strip()]


def generate_dictionary_of_patterns():
    """
    Build one list of compiled name regexes per person id. Middle names are
    handled explicitly; a possible refinement is to drop the middle name and
    instead match firstname + any single word + surname.
    """
    patterns_dict = {}

    for person in people:
        person_id = person.get('id')
        firstname = person.get('firstname', '').strip() if person.get('firstname') else ""
        middle_name = person.get('middle_name', '').strip() if person.get('middle_name') else ""
        surname = person.get('surname', '').strip() if person.get('surname') else ""
        birthname = person.get('birthname', '').strip() if person.get('birthname') else ""

        if not firstname or not surname:
            continue

        # Original, normalized, and initial forms of each name part
        name_parts = {
            'firstname': {
                'orig': firstname,
                'norm': normalize_text(firstname),
                'init': normalize_text(firstname)[0],
            },
            'surname': {
                'orig': surname,
                'norm': normalize_text(surname),
                'init': normalize_text(surname)[0],
            },
        }

        if middle_name:
            name_parts['middle_name'] = {
                'orig': middle_name,
                'norm': normalize_text(middle_name),
                'init': normalize_text(middle_name)[0],
            }

        if birthname and normalize_text(birthname) != normalize_text(surname):
            name_parts['birthname'] = {
                'orig': birthname,
                'norm': normalize_text(birthname),
                'init': normalize_text(birthname)[0],
            }

        person_patterns = set()

        def create_pattern(parts, formats, separators=None):
            if separators is None:
                separators = [""]

            patterns = []
            for fmt in formats:
                for sep in separators:
                    pattern_parts = []
                    for part_type, part_name in fmt:
                        if part_name in parts and part_type in parts[part_name]:
                            pattern_parts.append(re.escape(parts[part_name][part_type]))
                    if pattern_parts:
                        patterns.append(r"\b" + sep.join(pattern_parts) + r"\b")
            return patterns

        # Full-name orderings (first+last both ways; with a middle name, only
        # the first-middle-last order is generated)
        name_formats = [
            [('orig', 'firstname'), ('orig', 'surname')],
            [('norm', 'firstname'), ('norm', 'surname')],
            [('orig', 'surname'), ('orig', 'firstname')],
            [('norm', 'surname'), ('norm', 'firstname')],
        ]
        if 'middle_name' in name_parts:
            name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'firstname'), ('norm', 'middle_name'), ('norm', 'surname')],
            ]

        person_patterns.update(create_pattern(name_parts, name_formats, [" ", ""]))

        if 'middle_name' in name_parts:
            middle_name_formats = [
                [('orig', 'firstname'), ('orig', 'middle_name')],
                [('norm', 'firstname'), ('norm', 'middle_name')],
                [('orig', 'middle_name'), ('orig', 'surname')],
                [('norm', 'middle_name'), ('norm', 'surname')],
            ]
            person_patterns.update(create_pattern(name_parts, middle_name_formats, [" ", ""]))

        if 'birthname' in name_parts and name_parts['surname']['orig'] != name_parts['birthname']['orig']:
            birthname_formats = [
                [('orig', 'firstname'), ('orig', 'birthname')],
                [('norm', 'firstname'), ('norm', 'birthname')],
                [('orig', 'birthname'), ('orig', 'firstname')],
                [('norm', 'birthname'), ('norm', 'firstname')],
            ]
            person_patterns.update(create_pattern(name_parts, birthname_formats, [" ", ""]))

        # Initial-based patterns, e.g. "HCS", "H.C.S", "H C S", "H. C. S"
        initial_formats = [
            [('init', 'firstname'), ('init', 'middle_name'), ('init', 'surname')],
            [('init', 'firstname'), ('init', 'surname')],
        ]
        person_patterns.update(create_pattern(name_parts, initial_formats, ["", ".", " ", ". "]))

        patterns_dict[person_id] = [re.compile(pattern, re.IGNORECASE) for pattern in person_patterns]

    return patterns_dict
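

# Sketch of what one entry looks like for a hypothetical person
# {"firstname": "Hasan", "middle_name": "Cihan", "surname": "Kaya", ...}:
# the pattern set mixes spaced/unspaced full names ("hasan cihan kaya",
# "hasancihankaya") and initials ("hck", "h.c.k", "h c k", "h. c. k"),
# all compiled case-insensitive and keyed by the person's id.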


def extract_person_name_with_regex(found_dict, process_comment, patterns_dict):
    cleaned_text = process_comment
    all_matches = []

    for person_id, patterns in patterns_dict.items():
        person = next((p for p in people if p.get('id') == person_id), None)
        if not person:
            continue

        firstname_norm = normalize_text(person.get("firstname", "").strip()) if person.get("firstname") else ""
        surname_norm = normalize_text(person.get("surname", "").strip()) if person.get("surname") else ""
        birthname_norm = normalize_text(person.get("birthname", "").strip()) if person.get("birthname") else ""

        text_norm = normalize_text(process_comment)
        person_matches = []

        for pattern in patterns:
            for match in pattern.finditer(text_norm):
                start, end = match.span()
                matched_text = process_comment[start:end]
                matched_text_norm = normalize_text(matched_text)

                # Strict validation: single-word matches are rejected, and a
                # multi-word match must contain the firstname AND either the
                # surname or the birthname
                is_valid_match = False
                if len(matched_text_norm.split()) > 1:
                    has_firstname = firstname_norm and firstname_norm in matched_text_norm
                    has_surname = surname_norm and surname_norm in matched_text_norm
                    has_birthname = birthname_norm and birthname_norm in matched_text_norm
                    if has_firstname and (has_surname or has_birthname):
                        is_valid_match = True

                if is_valid_match:
                    person_matches.append({
                        'matched_text': matched_text,
                        'start': start,
                        'end': end,
                    })

        if person_matches:
            # Prefer the longest matches, then keep only non-overlapping spans
            person_matches.sort(key=lambda x: len(x['matched_text']), reverse=True)

            non_overlapping_matches = []
            for match in person_matches:
                overlaps = any(
                    match['start'] < existing['end'] and match['end'] > existing['start']
                    for existing in non_overlapping_matches
                )
                if not overlaps:
                    non_overlapping_matches.append(match)

            if non_overlapping_matches:
                found_dict["name_match"] = person
                all_matches.extend((match, person) for match in non_overlapping_matches)

    if all_matches:
        # Blank out every matched word so later extractors see a cleaner text
        all_matches.sort(key=lambda x: x[0]['start'], reverse=True)

        for match, person in all_matches:
            for word in match['matched_text'].split():
                word_norm = normalize_text(word).strip()
                if not word_norm:
                    continue

                text_norm = normalize_text(cleaned_text)
                for word_match in re.finditer(rf'\b{re.escape(word_norm)}\b', text_norm, re.IGNORECASE):
                    start, end = word_match.span()
                    cleaned_text = cleaned_text[:start] + ' ' * (end - start) + cleaned_text[end:]

        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return found_dict, cleaned_text


def extract_build_parts_info(found_dict, process_comment):
    """
    Regex extraction of flat references in forms such as:
        2 nolu daire, 9 NUMARALI DAI, daire 3, 3 nolu dairenin, 11nolu daire,
        Daire No 12, 2NOLU DAIRE, 12 No lu daire, D:10, NO:11, NO :3
    """
    cleaned_text = process_comment

    def clean_text_apartment_number(text, match):
        clean = text.replace(match.group(0), '').strip()
        return re.sub(r'\s+', ' ', clean).strip()

    # Tried in priority order; the first matching pattern wins
    apartment_patterns = [
        re.compile(r'(\d+)\s*nolu\s*daire', re.IGNORECASE),     # 2 nolu daire
        re.compile(r'(\d+)\s*nolu\s*daire\w*', re.IGNORECASE),  # 3 nolu dairenin
        re.compile(r'(\d+)nolu\s*daire', re.IGNORECASE),        # 11nolu daire / 2NOLU DAIRE
        re.compile(r'(\d+)\s*no\s*lu\s*daire', re.IGNORECASE),  # 12 No lu daire
        re.compile(r'daire\s*no\s*(\d+)', re.IGNORECASE),       # Daire No 12
        re.compile(r'(\d+)\s*numarali\s*dai', re.IGNORECASE),   # 9 NUMARALI DAI
        re.compile(r'daire\s*(\d+)', re.IGNORECASE),            # daire 3
        re.compile(r'd\s*:\s*(\d+)', re.IGNORECASE),            # D:10
        re.compile(r'no\s*:\s*(\d+)', re.IGNORECASE),           # NO:11 / NO :3
    ]
    for pattern in apartment_patterns:
        match = pattern.search(cleaned_text)
        if match:
            found_dict['apartment_number'] = match.group(1)
            return found_dict, clean_text_apartment_number(cleaned_text, match)

    return found_dict, cleaned_text
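

# e.g. extract_build_parts_info({}, "ODEME D:10 AIDAT")
# -> ({'apartment_number': '10'}, 'ODEME AIDAT')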


def extract_months(found_dict, process_comment):
    """
    Extract Turkish month names and abbreviations from the process comment.
    """
    original_text = process_comment
    # Month numbers keyed by both the Turkish spelling and its ASCII form
    month_to_number_dict = {
        "ocak": 1, "şubat": 2, "subat": 2, "mart": 3, "nisan": 4,
        "mayıs": 5, "mayis": 5, "haziran": 6, "temmuz": 7,
        "ağustos": 8, "agustos": 8, "eylül": 9, "eylul": 9,
        "ekim": 10, "kasım": 11, "kasim": 11, "aralık": 12, "aralik": 12,
    }

    def normalize_turkish(text):
        """Normalize Turkish text for case-insensitive comparison."""
        text = text.lower()
        text = text.replace('i̇', 'i')  # Collapse dotted i (i + combining dot)
        text = text.replace('ı', 'i')  # Map dotless i to a regular i
        return unidecode(text)         # Strip the remaining diacritics

    if 'months' not in found_dict:
        found_dict['months'] = []

    working_text = original_text

    # Full month names first
    for month in turkish_months:
        pattern = re.compile(r'\b' + re.escape(month) + r'\b', re.IGNORECASE)
        for match in pattern.finditer(original_text):
            matched_text = match.group(0)

            month_number = month_to_number_dict.get(month.lower())
            if month_number is None:
                month_number = month_to_number_dict.get(normalize_turkish(month))

            found_dict['months'].append({'name': month, 'number': month_number})
            working_text = working_text.replace(matched_text, '', 1)

    # Then the abbreviations, searched in the already-reduced text
    for abbr, full_month in turkish_months_abbr.items():
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\b', re.IGNORECASE)
        for match in pattern.finditer(working_text):
            matched_text = match.group(0)

            month_number = month_to_number_dict.get(full_month.lower())
            if month_number is None:
                month_number = month_to_number_dict.get(normalize_turkish(full_month))

            found_dict['months'].append({'name': full_month, 'number': month_number})
            working_text = working_text.replace(matched_text, '', 1)

    return found_dict, working_text
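

# e.g. extract_months({}, "TEMMUZ AYI ODEMESI")
# -> ({'months': [{'name': 'TEMMUZ', 'number': 7}]}, ' AYI ODEMESI')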


def extract_year(found_dict, process_comment):
    """
    Extract plausible years (start_year..current_year) from the process comment.
    """
    original_text = process_comment

    if 'years' not in found_dict:
        found_dict['years'] = []

    working_text = original_text

    for year in range(start_year, current_year + 1):
        pattern = re.compile(r'\b' + str(year) + r'\b')
        for match in pattern.finditer(original_text):
            matched_text = match.group(0)
            found_dict['years'].append(int(matched_text))
            working_text = working_text.replace(matched_text, '', 1)

    return found_dict, working_text


def extract_payment_type(found_dict, process_comment):
    """
    Extract the payment type from the process comment. Recognized spellings
    include: aidat, AİD, aidatı, TADİLAT, YAKIT, yakıt, yakit.
    """
    original_text = process_comment
    working_text = original_text

    if 'payment_types' not in found_dict:
        found_dict['payment_types'] = []

    payment_keywords = {
        'aidat': ['aidat', 'aİd', 'aid', 'aidatı', 'aidati'],
        'tadilat': ['tadilat', 'tadİlat', 'tadilatı'],
        'yakit': ['yakit', 'yakıt', 'yakıtı', 'yakiti'],
    }

    for payment_type, keywords in payment_keywords.items():
        for keyword in keywords:
            pattern = re.compile(r'\b' + keyword + r'\b', re.IGNORECASE)
            for match in pattern.finditer(original_text):
                matched_text = match.group(0)
                if payment_type not in found_dict['payment_types']:
                    found_dict['payment_types'].append(payment_type)
                working_text = working_text.replace(matched_text, '', 1)

    return found_dict, working_text
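

# e.g. extract_payment_type({}, "EYLUL AIDAT ODEMESI")
# -> ({'payment_types': ['aidat']}, 'EYLUL  ODEMESI')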


def main():
    patterns_dict = generate_dictionary_of_patterns()
    dicts_found = dict()
    dicts_not_found = dict()
    for account_record in account_records:
        account_record_id = str(account_record["id"])
        found_dict = {}
        process_comment_iteration = clean_text(text=account_record["process_comment"])
        found_dict, cleaned_process_comment = extract_person_name_with_regex(
            found_dict=found_dict, process_comment=process_comment_iteration, patterns_dict=patterns_dict
        )

        found_dict, cleaned_process_comment = extract_build_parts_info(found_dict=found_dict, process_comment=cleaned_process_comment)
        found_dict, cleaned_process_comment = extract_months(found_dict=found_dict, process_comment=cleaned_process_comment)
        found_dict, cleaned_process_comment = extract_year(found_dict=found_dict, process_comment=cleaned_process_comment)
        found_dict, cleaned_process_comment = extract_payment_type(found_dict=found_dict, process_comment=cleaned_process_comment)
        # The extractors pre-create empty list keys, so test the values
        # rather than the dict itself
        if any(found_dict.values()):
            dicts_found[process_comment_iteration] = found_dict
        else:
            dicts_not_found[process_comment_iteration] = account_record_id

    print("\n===== SUMMARY =====")
    print(f"Extracted data total     : {len(dicts_found)}")
    print(f"Not extracted data total : {len(account_records) - len(dicts_found)}")
    print(f"Account records processed: {len(account_records)}")


if __name__ == "__main__":
    main()