# transaction_classifier.py
"""
Transaction Classifier for Tax Optimization
Classifies Mono API and manual transactions into tax-relevant categories
"""
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from datetime import datetime  # noqa: F401  (kept: may be relied on by importers of this module)
from typing import Dict, List, Any, Optional

logger = logging.getLogger(__name__)


@dataclass
class TaxClassification:
    """Result of classifying a transaction for tax purposes"""
    tax_category: str
    tax_treatment: str  # taxable, deductible, potentially_deductible, exempt, unknown
    deductible: bool
    confidence: float
    suggested_rule_ids: List[str]
    notes: Optional[str] = None


class TransactionClassifier:
    """
    Classifies bank transactions (from Mono API or manual entry) into tax categories

    Classification is keyword based: the upper-cased narration is matched
    against the regex tables below. Categories are tried in dict order and the
    first category with a matching pattern wins, so more specific categories
    must be listed before more generic ones.
    """

    # Nigerian bank transaction patterns (expanded for better coverage)
    INCOME_PATTERNS = {
        'employment_income': [
            r'\bSALARY\b', r'\bWAGES\b', r'\bPAYROLL\b', r'\bSTIPEND\b',
            r'\bEMPLOYMENT\b', r'\bMONTHLY PAY\b', r'\bNET PAY\b',
            r'\bGROSS PAY\b', r'\bEARNINGS\b', r'\bSALARY PAYMENT\b'
        ],
        'business_income': [
            r'\bSALES\b', r'\bREVENUE\b', r'\bINVOICE\b',
            r'\bPAYMENT RECEIVED\b', r'\bCUSTOMER\b', r'\bCLIENT\b',
            r'\bPROJECT\b', r'\bCONSULTING\b', r'\bFREELANCE\b',
            r'\bCONTRACT\b'
        ],
        'rental_income': [
            r'\bRENT RECEIVED\b', r'\bTENANT\b', r'\bLEASE PAYMENT\b',
            r'\bPROPERTY INCOME\b', r'\bRENTAL\b'
        ],
        'investment_income': [
            r'\bDIVIDEND\b', r'\bINTEREST\b', r'\bINVESTMENT\b',
            r'\bCOUPON\b', r'\bBOND\b', r'\bSTOCK\b', r'\bSHARE\b'
        ]
    }

    # NOTE: 'health_insurance' is deliberately listed BEFORE 'life_insurance'.
    # The life-insurance list contains the generic "INSURANCE PREMIUM" pattern,
    # which would otherwise capture "HEALTH INSURANCE PREMIUM" narrations.
    DEDUCTION_PATTERNS = {
        'pension_contribution': [
            r'\bPENSION\b', r'\bPFA\b', r'\bRSA\b', r'\bRETIREMENT\b',
            r'\bPENSION FUND\b', r'\bPENSION CONTRIBUTION\b'
        ],
        'nhf_contribution': [
            r'\bNHF\b', r'\bHOUSING FUND\b', r'\bNATIONAL HOUSING\b'
        ],
        'health_insurance': [
            r'\bHEALTH INSURANCE\b', r'\bHMO\b', r'\bMEDICAL INSURANCE\b',
            r'\bHEALTH PLAN\b'
        ],
        'life_insurance': [
            r'\bLIFE INSURANCE\b', r'\bLIFE ASSURANCE\b',
            r'\bINSURANCE PREMIUM\b', r'\bPOLICY PREMIUM\b'
        ],
        'rent_paid': [
            r'\bRENT\b', r'\bLANDLORD\b', r'\bLEASE\b', r'\bHOUSE RENT\b',
            r'\bAPARTMENT RENT\b'
        ],
        'union_dues': [
            r'\bUNION DUES\b', r'\bPROFESSIONAL FEES\b',
            r'\bASSOCIATION FEES\b', r'\bMEMBERSHIP DUES\b'
        ]
    }

    # Note attached to transactions that no pattern could place.
    _UNCATEGORIZED_NOTE = "Could not automatically categorize. Manual review recommended."

    # Upper bound applied to any confidence reported by the LLM.
    _LLM_CONFIDENCE_CAP = 0.85

    def __init__(self, rag_pipeline: Optional[Any] = None):
        """
        Initialize classifier

        Args:
            rag_pipeline: Optional RAG pipeline for LLM-based classification
                of ambiguous transactions. Only its
                ``query(prompt, verbose=False)`` method is used.
        """
        self.rag = rag_pipeline

    @staticmethod
    def _normalize_amount(transaction: Dict[str, Any]) -> float:
        """
        Return the absolute transaction amount in naira.

        ``amount`` (naira) takes precedence; otherwise ``amount_kobo`` is
        divided by 100. Missing or ``None`` values yield 0.0. Values that
        cannot be converted with ``float()`` still raise, as before.
        """
        amount = transaction.get("amount")
        if amount is not None:
            return abs(float(amount))
        amount_kobo = transaction.get("amount_kobo")
        if amount_kobo is None:
            return 0.0
        return abs(float(amount_kobo) / 100.0)

    @staticmethod
    def _uncategorized(notes: Optional[str] = None) -> TaxClassification:
        """Build a fresh low-confidence 'uncategorized' classification."""
        return TaxClassification(
            tax_category="uncategorized",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.3,
            suggested_rule_ids=[],
            notes=notes
        )

    def classify_transaction(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a transaction (from Mono API or manual entry)

        Accepts both formats:
        - Mono API: {"_id", "type": "credit/debit", "amount": 50000, "narration": "..."}
        - Backend: {"id", "type": "income/expense", "amount_kobo": 5000000, "description": "..."}

        Args:
            transaction: Transaction dict in either format. It is not mutated.

        Returns:
            A new dict with every original key plus "tax_category",
            "tax_treatment", "deductible", "confidence",
            "suggested_rule_ids" and "tax_notes".
        """
        # Normalize narration/description (patterns are written in upper case)
        raw_text = transaction.get("narration") or transaction.get("description") or ""
        narration = str(raw_text).upper()

        # Normalize amount (handle both kobo and naira)
        amount = self._normalize_amount(transaction)

        # Normalize type (handle both credit/debit and income/expense).
        # `or ""` guards against an explicit None coming from the API.
        tx_type = str(transaction.get("type") or "").strip().lower()
        tx_type = {"income": "credit", "expense": "debit"}.get(tx_type, tx_type)

        # Classify using pattern matching
        classification = self._classify_by_patterns(narration, tx_type, amount)

        # NOTE: the LLM fallback (_llm_classify) is intentionally NOT invoked
        # here, to avoid rate limits. Only pattern matching is used for now.

        # Enrich original transaction
        return {
            **transaction,
            "tax_category": classification.tax_category,
            "tax_treatment": classification.tax_treatment,
            "deductible": classification.deductible,
            "confidence": classification.confidence,
            "suggested_rule_ids": classification.suggested_rule_ids,
            "tax_notes": classification.notes
        }

    def classify_batch(self, transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify multiple transactions, preserving input order"""
        return [self.classify_transaction(tx) for tx in transactions]

    def _classify_by_patterns(
        self,
        narration: str,
        tx_type: str,
        amount: float
    ) -> TaxClassification:
        """
        Pattern-based classification using regex

        Args:
            narration: Upper-cased narration text.
            tx_type: "credit" (checked against income patterns) or "debit"
                (checked against deduction patterns); anything else is
                left uncategorized.
            amount: Absolute amount in naira, forwarded to the builders.
        """
        if tx_type == "credit":
            table, build = self.INCOME_PATTERNS, self._get_income_classification
        elif tx_type == "debit":
            table, build = self.DEDUCTION_PATTERNS, self._get_deduction_classification
        else:
            return self._uncategorized(self._UNCATEGORIZED_NOTE)

        # First category (in table order) with any matching pattern wins.
        for category, patterns in table.items():
            if any(re.search(pattern, narration) for pattern in patterns):
                return build(category, amount)

        # Default: uncategorized
        return self._uncategorized(self._UNCATEGORIZED_NOTE)

    def _get_income_classification(self, category: str, amount: float) -> TaxClassification:
        """
        Get classification for income categories

        The table is rebuilt on every call so each caller receives objects
        (notably the ``suggested_rule_ids`` list) it can safely mutate.
        ``amount`` is accepted for interface symmetry and currently unused.
        """
        classifications = {
            'employment_income': TaxClassification(
                tax_category="employment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.95,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Employment income is fully taxable under PITA"
            ),
            'business_income': TaxClassification(
                tax_category="business_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=["cit.rate.small_2025", "cit.rate.medium_2025", "cit.rate.large_2025"],
                notes="Business income subject to CIT or PIT depending on structure"
            ),
            'rental_income': TaxClassification(
                tax_category="rental_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.90,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Rental income is taxable. Consider property expenses as deductions."
            ),
            'investment_income': TaxClassification(
                tax_category="investment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=[],
                notes="Investment income may be subject to withholding tax"
            )
        }

        return classifications.get(category, TaxClassification(
            tax_category="other_income",
            tax_treatment="taxable",
            deductible=False,
            confidence=0.5,
            suggested_rule_ids=[]
        ))

    def _get_deduction_classification(self, category: str, amount: float) -> TaxClassification:
        """
        Get classification for deduction categories

        The table is rebuilt on every call so each caller receives objects
        (notably the ``suggested_rule_ids`` list) it can safely mutate.
        ``amount`` is accepted for interface symmetry and currently unused.
        """
        classifications = {
            'pension_contribution': TaxClassification(
                tax_category="pension_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.deduction.pension"],
                notes="Pension contributions to PRA-approved schemes are tax deductible (PITA s.20(1)(g))"
            ),
            'nhf_contribution': TaxClassification(
                tax_category="nhf_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="NHF contributions are tax deductible (2.5% of basic salary)"
            ),
            'life_insurance': TaxClassification(
                tax_category="life_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.85,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Life insurance premiums are tax deductible if policy is with licensed insurer"
            ),
            'health_insurance': TaxClassification(
                tax_category="health_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Health insurance premiums may be tax deductible"
            ),
            'rent_paid': TaxClassification(
                tax_category="rent_paid",
                tax_treatment="potentially_deductible",
                deductible=False,  # Not in 2025, but yes in 2026
                confidence=0.85,
                suggested_rule_ids=["pit.relief.rent_2026"],
                notes="Rent paid: Not deductible in 2025. From 2026, 20% of rent (max ₦500K) under NTA 2025"
            ),
            'union_dues': TaxClassification(
                tax_category="union_dues",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Professional association fees and union dues are tax deductible"
            )
        }

        return classifications.get(category, TaxClassification(
            tax_category="other_expense",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.4,
            suggested_rule_ids=[]
        ))

    def _llm_classify(self, transaction: Dict[str, Any]) -> TaxClassification:
        """
        Use LLM/RAG to classify ambiguous transactions

        This is a fallback for transactions that don't match patterns.
        It is currently not called by classify_transaction.

        Args:
            transaction: Transaction dict in Mono or Backend format.

        Returns:
            The parsed LLM classification, or a low-confidence
            "uncategorized" result when no RAG pipeline is configured, the
            query fails, or the reply cannot be parsed into a known category.
        """
        if not self.rag:
            return self._uncategorized()

        # Accept both payload formats, exactly like classify_transaction does.
        narration = transaction.get("narration") or transaction.get("description") or ""
        tx_type = transaction.get("type") or ""
        try:
            amount = self._normalize_amount(transaction)
        except (TypeError, ValueError):
            # The amount only decorates the prompt; don't fail over it.
            amount = 0.0

        prompt = f"""
        Classify this Nigerian bank transaction for tax purposes:

        Transaction Details:
        - Narration: {narration}
        - Amount: ₦{amount:,.2f}
        - Type: {tx_type}

        Classify into ONE of these categories:
        - employment_income (salary, wages, stipend)
        - business_income (sales, revenue, client payments)
        - rental_income (rent received from tenants)
        - pension_contribution (PFA, RSA contributions)
        - nhf_contribution (National Housing Fund)
        - life_insurance (insurance premiums)
        - rent_paid (rent paid to landlord)
        - union_dues (professional fees, association dues)
        - uncategorized (if unclear)

        Also indicate:
        1. Is it tax deductible? (yes/no)
        2. Confidence level (0.0 to 1.0)

        Respond with just the category name, deductible status, and confidence.
        Example: "employment_income, no, 0.95"
        """

        known_categories = (
            set(self.INCOME_PATTERNS) | set(self.DEDUCTION_PATTERNS) | {"uncategorized"}
        )

        try:
            # Query RAG pipeline (external call: any failure degrades to the fallback)
            response = self.rag.query(prompt, verbose=False)

            # Expected shape: "<category>, <yes|no>, <confidence>"; tolerate
            # surrounding quotes, whitespace and trailing periods.
            parts = [
                part.strip(" \t\r\n\"'.")
                for part in str(response).lower().split(',')
            ]
            if len(parts) >= 3 and parts[0] in known_categories:
                category = parts[0]
                is_income = category in self.INCOME_PATTERNS
                # Only expense categories can be deductible.
                deductible = 'yes' in parts[1] and category in self.DEDUCTION_PATTERNS
                confidence = float(parts[2])

                if is_income:
                    treatment = "taxable"
                elif deductible:
                    treatment = "deductible"
                else:
                    # A non-deductible expense is not "taxable".
                    treatment = "unknown"

                return TaxClassification(
                    tax_category=category,
                    tax_treatment=treatment,
                    deductible=deductible,
                    # Cap LLM confidence and keep it non-negative
                    confidence=max(0.0, min(confidence, self._LLM_CONFIDENCE_CAP)),
                    suggested_rule_ids=[],
                    notes="Classified using AI analysis"
                )
            logger.warning("LLM classification returned an unusable reply: %r", response)
        except Exception as e:  # boundary with an external service
            logger.warning("LLM classification failed: %s", e)

        # Fallback
        return self._uncategorized()

    def get_classification_summary(self, classified_transactions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate summary statistics of classified transactions

        Args:
            classified_transactions: Output of classify_transaction /
                classify_batch. Rows without "tax_category" are treated
                as "uncategorized".

        Returns:
            A dict with the same keys for empty and non-empty input:
            "total_transactions" (and its legacy alias "total"),
            "categorized", "uncategorized", "high_confidence"
            (confidence > 0.8), "categorization_rate",
            "transactions_by_category" and "amounts_by_category"
            (absolute naira amounts; kobo amounts are converted).
        """
        total = len(classified_transactions)
        categorized = 0
        high_confidence = 0
        by_category: Dict[str, int] = {}
        amounts_by_category: Dict[str, float] = {}

        for tx in classified_transactions:
            cat = tx.get("tax_category") or "uncategorized"
            if cat != "uncategorized":
                categorized += 1
            if (tx.get("confidence") or 0) > 0.8:
                high_confidence += 1

            # Group by category: counts and total amounts
            by_category[cat] = by_category.get(cat, 0) + 1
            amounts_by_category[cat] = (
                amounts_by_category.get(cat, 0) + self._normalize_amount(tx)
            )

        return {
            "total": total,  # legacy alias previously returned for empty input
            "total_transactions": total,
            "categorized": categorized,
            "uncategorized": total - categorized,
            "high_confidence": high_confidence,
            "categorization_rate": categorized / total if total > 0 else 0,
            "transactions_by_category": by_category,
            "amounts_by_category": amounts_by_category
        }