# prod-wag-backend-automate-s.../BankServices/ParserService/app.py

import time
import arrow
import io
import json
import base64
import datetime

from pandas import DataFrame, read_excel
from unidecode import unidecode
from BankServices.ServiceDepends.config import Config

# Import Redis pub/sub handler
from Controllers.Redis.Broadcast.actions import redis_pubsub

# Redis pub/sub channel names used by this service
REDIS_CHANNEL_IN = "reader"  # Channel this service subscribes to for incoming messages
REDIS_CHANNEL_OUT = "parser"  # Channel the parsed results are published to
delimiter = "|"  # NOTE(review): not referenced in the visible code — confirm it is still needed


def publish_parsed_data_to_redis(data, collected_data_dict: list[dict], filename: str) -> bool:
    """Send the parsing outcome for one file to the parser output channel.

    The outgoing message keeps every key of the incoming message except
    ``payload`` and ``is_base64``, and adds ``parsed``, ``stage``,
    ``parsed_at`` and ``filename``.

    Args:
        data: Incoming message; anything that is not a dict is treated as empty
        collected_data_dict: Rows extracted from the Excel file; an empty
            value marks the message with stage 'not found'
        filename: Name of the file the rows came from

    Returns:
        bool: True if the publisher reported success, False otherwise
    """
    # Work on a copy so the caller's message is left untouched
    outgoing = {}
    if isinstance(data, dict):
        outgoing = data.copy()

    # The raw file content is not forwarded downstream
    for raw_key in ("payload", "is_base64"):
        outgoing.pop(raw_key, None)

    # Record the result and the stage that reflects it
    has_rows = bool(collected_data_dict)
    outgoing["parsed"] = collected_data_dict if has_rows else None
    outgoing["stage"] = "parsed" if has_rows else "not found"

    # Stamp the message with the processing time and source file
    outgoing["parsed_at"] = str(arrow.now())
    outgoing["filename"] = filename

    outcome = redis_pubsub.publisher.publish(REDIS_CHANNEL_OUT, outgoing)
    if not outcome.status:
        print(f"[PARSER_SERVICE] Publish error: {outcome.error}")
        return False

    print(f"[PARSER_SERVICE] Published parsed data for {filename} with stage: {outgoing['stage']}")
    return True


def _to_amount(value) -> float:
    """Convert a spreadsheet cell to a number, mapping empty or NaN cells to 0."""
    if not value:
        return 0
    amount = float(str(value).replace(",", ""))
    # Empty Excel cells arrive as NaN, which is truthy; NaN is the only float
    # that is not equal to itself, so this catches it without extra imports.
    return 0 if amount != amount else amount


def parse_excel_file(excel_frame: DataFrame) -> list[dict]:
    """Extract transaction records from a bank statement DataFrame.

    Columns are addressed by position (1-based, because ``itertuples()`` puts
    the index at position 0):

    * a row whose column 3 contains "IBAN" supplies the IBAN (column 5, spaces
      removed) that is attached to every following record;
    * a row is a transaction when columns 1 and 2 are not empty and column 1
      contains at least two "/" separators; column 1 is then parsed as a
      ``dd/mm/YYYY-HH:MM:SS`` timestamp;
    * columns 4, 5 and 6 are amounts (commas stripped, empty cells become 0),
      columns 3, 7, 8, 9 and 15 are copied as text.

    Args:
        excel_frame: DataFrame containing Excel data

    Returns:
        list[dict]: List of parsed data dictionaries. If any row raises an
        error, parsing stops and the records collected so far are returned.
    """
    iban, data_list = "", []
    try:
        for row in excel_frame.itertuples():
            if "IBAN" in str(row[3]).upper():
                iban = str(row[5]).replace(" ", "")

            # Skip rows with an empty date or second column
            if str(row[1]) == "nan" or str(row[2]) == "nan":
                continue
            # Only cells shaped like a d/m/Y date are transaction rows
            if len(str(row[1]).split("/")) <= 2:
                continue

            bank_date = datetime.datetime.strptime(str(row[1]), "%d/%m/%Y-%H:%M:%S")
            data_list.append(
                dict(
                    iban=str(iban),
                    bank_date=str(arrow.get(bank_date)),
                    channel_branch=unidecode(str(row[3])),
                    currency_value=_to_amount(row[4]),
                    balance=_to_amount(row[5]),
                    additional_balance=_to_amount(row[6]),
                    process_name=str(row[7]),
                    process_type=unidecode(str(row[8])),
                    process_comment=unidecode(str(row[9])),
                    bank_reference_code=str(row[15]),
                )
            )
        print(f"[PARSER_SERVICE] Successfully parsed {len(data_list)} records from Excel file")
    except Exception as e:
        # Best effort: report the problem and return whatever was parsed so far
        print(f"[PARSER_SERVICE] Error parsing Excel file: {str(e)}")
    return data_list


def _decode_payload(payload, is_base64):
    """Return the payload as bytes; None when a base64 payload cannot be decoded."""
    if not isinstance(payload, str):
        return payload
    if not is_base64:
        # Plain string payloads are passed on as UTF-8 bytes
        return payload.encode('utf-8')
    try:
        decoded = base64.b64decode(payload)
    except (ValueError, TypeError) as e:
        print(f"[PARSER_SERVICE] Error decoding base64 payload: {str(e)}")
        return None
    print(f"[PARSER_SERVICE] Successfully decoded base64 payload, size: {len(decoded)} bytes")
    return decoded


def _save_debug_copy(payload, filename):
    """Write the payload under /tmp for debugging; return the path, or None on failure."""
    import os

    # The filename comes from the message, so keep only its last path
    # component to make sure the copy cannot be written outside /tmp.
    safe_name = os.path.basename(str(filename or "")) or "payload"
    temp_file_path = os.path.join("/tmp", safe_name)
    try:
        with open(temp_file_path, 'wb') as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        print(f"[PARSER_SERVICE] Could not save debug file: {str(e)}")
        return None
    print(f"[PARSER_SERVICE] Saved payload to {temp_file_path} for debugging")
    return temp_file_path


def _load_excel_frame(payload, filename, temp_file_path):
    """Read the payload into a DataFrame, trying several readers in turn."""
    lowered = str(filename or "").lower()
    approaches = [
        # Approach 1: Try xlrd for .xls files
        lambda: DataFrame(read_excel(io.BytesIO(payload), engine='xlrd')) if lowered.endswith('.xls') else None,
        # Approach 2: Try openpyxl for .xlsx files
        lambda: DataFrame(read_excel(io.BytesIO(payload), engine='openpyxl')) if lowered.endswith('.xlsx') else None,
        # Approach 3: Try xlrd with explicit sheet name
        lambda: DataFrame(read_excel(io.BytesIO(payload), engine='xlrd', sheet_name=0)),
        # Approach 4: Try with temporary file (only if the debug copy was written)
        lambda: DataFrame(read_excel(temp_file_path)) if temp_file_path else None,
    ]

    errors = []
    for i, approach in enumerate(approaches, start=1):
        try:
            frame = approach()
        except Exception as e:
            # Each engine raises its own error types; collect and try the next one
            errors.append(f"Approach {i}: {str(e)}")
            continue
        if frame is not None:
            print(f"[PARSER_SERVICE] Successfully read Excel file using approach {i}")
            return frame

    error_details = "\n".join(errors)
    raise RuntimeError(f"Failed to read Excel file using all approaches:\n{error_details}")


def process_message(message):
    """Process a message from Redis.

    Messages whose ``stage`` is 'red' carry an Excel file in ``payload``
    (optionally base64-encoded, flagged by ``is_base64``). The file is read,
    parsed and the result is published; every other message is skipped.
    Errors while handling a 'red' message are logged and never propagated.

    Args:
        message: Message data from Redis subscriber; its ``data`` entry is
            either a dict or a JSON document (str or bytes) encoding one
    """
    # Extract the message data
    data = message["data"]

    # If data is still serialized, parse it as JSON
    if isinstance(data, (str, bytes, bytearray)):
        try:
            data = json.loads(data)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"[PARSER_SERVICE] Error parsing message data: {e}")
            return

    # Anything that is not a dict cannot be a file message
    if not isinstance(data, dict):
        print(f"[PARSER_SERVICE] Ignoring message with unexpected data type: {type(data).__name__}")
        return

    # Only messages in stage 'red' are processed
    if data.get("stage") != "red":
        print(f"[PARSER_SERVICE] Skipped message with UUID: {data.get('uuid')} (stage is not 'red')")
        return

    try:
        filename = data.get("filename")
        print(f"[PARSER_SERVICE] Processing file: {filename}")

        payload = _decode_payload(data.get("payload"), data.get("is_base64", False))
        if not isinstance(payload, (bytes, bytearray)):
            # Missing payload or failed decode: there is nothing to parse
            print(f"[PARSER_SERVICE] No usable payload for file: {filename}")
            return

        # Keep a copy on disk for debugging; it also backs the last read attempt
        temp_file_path = _save_debug_copy(payload, filename)
        excel_frame = _load_excel_frame(payload, filename, temp_file_path)

        # Extract data from the Excel file
        collected_data_dict = parse_excel_file(excel_frame)

        # Publish parsed data to Redis
        publish_parsed_data_to_redis(
            data=data,
            collected_data_dict=collected_data_dict,
            filename=filename
        )
    except Exception as e:
        # Boundary of the subscriber callback: log and keep the listener running
        print(f"[PARSER_SERVICE] Error processing message: {str(e)}")


def app():
    """Subscribe to the input channel and start listening for messages.

    Logs the outcome of each step and stops at the first one that fails.
    Always returns None.
    """
    print("[PARSER_SERVICE] Starting Parser Service")

    # Register process_message as the handler for the input channel
    subscription = redis_pubsub.subscriber.subscribe(REDIS_CHANNEL_IN, process_message)
    if not subscription.status:
        print(f"[PARSER_SERVICE] Subscribe error: {subscription.error}")
        return
    print(f"[PARSER_SERVICE] Subscribed to channel: {REDIS_CHANNEL_IN}")

    # Start the listener with in_thread=True
    listener = redis_pubsub.subscriber.start_listening(in_thread=True)
    if listener.status:
        status_line = "[PARSER_SERVICE] Listening for messages"
    else:
        status_line = f"[PARSER_SERVICE] Error starting listener: {listener.error}"
    print(status_line)


if __name__ == "__main__":
    # Subscribe and start the listener (app() requests in_thread=True)
    app()

    # Keep the main thread alive, waking up every Config.EMAIL_SLEEP seconds.
    # app() returns None whether or not start-up succeeded, so this loop is
    # entered either way.
    try:
        while True:
            time.sleep(Config.EMAIL_SLEEP)
    except KeyboardInterrupt:
        # Interrupted by the user: stop the subscriber before exiting
        print("\n[PARSER_SERVICE] Stopping service...")
        redis_pubsub.subscriber.stop_listening()