# transaction_classifier.py
"""
Transaction Classifier for Tax Optimization
Classifies Mono API and manual transactions into tax-relevant categories
"""
from __future__ import annotations
from typing import Dict, List, Any, Optional
import re
from dataclasses import dataclass
from datetime import datetime


@dataclass
class TaxClassification:
    """
    Outcome of classifying a single transaction for tax purposes.

    Attributes:
        tax_category: Category label assigned to the transaction.
        tax_treatment: How the category is treated for tax
            (taxable, deductible, exempt, unknown).
        deductible: Whether the amount is flagged as tax deductible.
        confidence: Confidence score attached to the classification.
        suggested_rule_ids: Identifiers of tax rules suggested for the category.
        notes: Optional free-text explanation; defaults to None.
    """

    # Required fields (positional order is part of the constructor signature).
    tax_category: str
    tax_treatment: str
    deductible: bool
    confidence: float
    suggested_rule_ids: List[str]

    # Optional field.
    notes: Optional[str] = None


class TransactionClassifier:
    """
    Classifies bank transactions (from Mono API or manual entry) into tax categories.

    Classification is regex based: the upper-cased narration of a credit is
    matched against INCOME_PATTERNS and that of a debit against
    DEDUCTION_PATTERNS. Categories are tried in declaration order and the
    first category with a matching pattern wins. Anything that does not match
    is returned as "uncategorized" with a low confidence.
    """

    # Nigerian bank transaction patterns (expanded for better coverage).
    # Patterns are upper-case and are matched without re.IGNORECASE, so the
    # narration must be upper-cased before matching.
    INCOME_PATTERNS = {
        'employment_income': [
            r'\bSALARY\b', r'\bWAGES\b', r'\bPAYROLL\b', r'\bSTIPEND\b',
            r'\bEMPLOYMENT\b', r'\bMONTHLY PAY\b', r'\bNET PAY\b',
            r'\bGROSS PAY\b', r'\bEARNINGS\b', r'\bSALARY PAYMENT\b'
        ],
        'business_income': [
            r'\bSALES\b', r'\bREVENUE\b', r'\bINVOICE\b', r'\bPAYMENT RECEIVED\b',
            r'\bCUSTOMER\b', r'\bCLIENT\b', r'\bPROJECT\b', r'\bCONSULTING\b',
            r'\bFREELANCE\b', r'\bCONTRACT\b'
        ],
        'rental_income': [
            r'\bRENT RECEIVED\b', r'\bTENANT\b', r'\bLEASE PAYMENT\b',
            r'\bPROPERTY INCOME\b', r'\bRENTAL\b'
        ],
        'investment_income': [
            r'\bDIVIDEND\b', r'\bINTEREST\b', r'\bINVESTMENT\b',
            r'\bCOUPON\b', r'\bBOND\b', r'\bSTOCK\b', r'\bSHARE\b'
        ]
    }

    DEDUCTION_PATTERNS = {
        'pension_contribution': [
            r'\bPENSION\b', r'\bPFA\b', r'\bRSA\b', r'\bRETIREMENT\b',
            r'\bPENSION FUND\b', r'\bPENSION CONTRIBUTION\b'
        ],
        'nhf_contribution': [
            r'\bNHF\b', r'\bHOUSING FUND\b', r'\bNATIONAL HOUSING\b'
        ],
        # health_insurance must be tried BEFORE life_insurance: the generic
        # INSURANCE PREMIUM / POLICY PREMIUM patterns below would otherwise
        # capture narrations such as "HEALTH INSURANCE PREMIUM".
        'health_insurance': [
            r'\bHEALTH INSURANCE\b', r'\bHMO\b', r'\bMEDICAL INSURANCE\b',
            r'\bHEALTH PLAN\b'
        ],
        'life_insurance': [
            r'\bLIFE INSURANCE\b', r'\bLIFE ASSURANCE\b', r'\bINSURANCE PREMIUM\b',
            r'\bPOLICY PREMIUM\b'
        ],
        'rent_paid': [
            r'\bRENT\b', r'\bLANDLORD\b', r'\bLEASE\b', r'\bHOUSE RENT\b',
            r'\bAPARTMENT RENT\b'
        ],
        'union_dues': [
            r'\bUNION DUES\b', r'\bPROFESSIONAL FEES\b', r'\bASSOCIATION FEES\b',
            r'\bMEMBERSHIP DUES\b'
        ]
    }

    # Per-category details for income. Every income category is "taxable" and
    # not deductible, so only the varying fields are stored here.
    _INCOME_CLASSIFICATIONS = {
        'employment_income': {
            "confidence": 0.95,
            "suggested_rule_ids": ["pit.base.gross_income"],
            "notes": "Employment income is fully taxable under PITA",
        },
        'business_income': {
            "confidence": 0.85,
            "suggested_rule_ids": ["cit.rate.small_2025", "cit.rate.medium_2025", "cit.rate.large_2025"],
            "notes": "Business income subject to CIT or PIT depending on structure",
        },
        'rental_income': {
            "confidence": 0.90,
            "suggested_rule_ids": ["pit.base.gross_income"],
            "notes": "Rental income is taxable. Consider property expenses as deductions.",
        },
        'investment_income': {
            "confidence": 0.85,
            "suggested_rule_ids": [],
            "notes": "Investment income may be subject to withholding tax",
        },
    }

    # Per-category details for deductions / expenses.
    _DEDUCTION_CLASSIFICATIONS = {
        'pension_contribution': {
            "tax_treatment": "deductible",
            "deductible": True,
            "confidence": 0.95,
            "suggested_rule_ids": ["pit.deduction.pension"],
            "notes": "Pension contributions to PRA-approved schemes are tax deductible (PITA s.20(1)(g))",
        },
        'nhf_contribution': {
            "tax_treatment": "deductible",
            "deductible": True,
            "confidence": 0.95,
            "suggested_rule_ids": ["pit.base.taxable_income"],
            "notes": "NHF contributions are tax deductible (2.5% of basic salary)",
        },
        'life_insurance': {
            "tax_treatment": "deductible",
            "deductible": True,
            "confidence": 0.85,
            "suggested_rule_ids": ["pit.base.taxable_income"],
            "notes": "Life insurance premiums are tax deductible if policy is with licensed insurer",
        },
        'health_insurance': {
            "tax_treatment": "deductible",
            "deductible": True,
            "confidence": 0.80,
            "suggested_rule_ids": ["pit.base.taxable_income"],
            "notes": "Health insurance premiums may be tax deductible",
        },
        'rent_paid': {
            "tax_treatment": "potentially_deductible",
            "deductible": False,  # Not in 2025, but yes in 2026
            "confidence": 0.85,
            "suggested_rule_ids": ["pit.relief.rent_2026"],
            "notes": "Rent paid: Not deductible in 2025. From 2026, 20% of rent (max ₦500K) under NTA 2025",
        },
        'union_dues': {
            "tax_treatment": "deductible",
            "deductible": True,
            "confidence": 0.80,
            "suggested_rule_ids": ["pit.base.taxable_income"],
            "notes": "Professional association fees and union dues are tax deductible",
        },
    }

    # Upper bound applied to any confidence reported by the LLM.
    _LLM_CONFIDENCE_CAP = 0.85

    def __init__(self, rag_pipeline: Optional[Any] = None):
        """
        Initialize classifier

        Args:
            rag_pipeline: Optional RAG pipeline for LLM-based classification of
                ambiguous transactions. Only used by `_llm_classify`, which
                calls its `query(prompt, verbose=False)` method.
        """
        self.rag = rag_pipeline

    # ------------------------------------------------------------------
    # Input normalization helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_narration(transaction: Dict[str, Any]) -> str:
        """Return the narration (Mono) or description (backend) text, or ""."""
        return str(transaction.get("narration") or transaction.get("description") or "")

    @staticmethod
    def _normalize_amount(transaction: Dict[str, Any]) -> float:
        """
        Return the absolute transaction amount.

        "amount" is used as-is when present; otherwise "amount_kobo" is
        divided by 100. A missing or unparseable value yields 0.0 rather than
        raising, so one malformed amount cannot abort a whole batch.
        """
        raw = transaction.get("amount")
        divisor = 1.0
        if raw is None:
            raw = transaction.get("amount_kobo", 0)
            divisor = 100.0
        try:
            return abs(float(raw)) / divisor
        except (TypeError, ValueError):
            return 0.0

    @staticmethod
    def _normalize_type(transaction: Dict[str, Any]) -> str:
        """Map income/expense onto credit/debit; other values pass through lower-cased."""
        raw = str(transaction.get("type") or "").strip().lower()
        return {"income": "credit", "expense": "debit"}.get(raw, raw)

    @staticmethod
    def _first_matching_category(
        narration: str,
        pattern_table: Dict[str, List[str]]
    ) -> Optional[str]:
        """Return the first category (in table order) with a pattern found in narration."""
        for category, patterns in pattern_table.items():
            if any(re.search(pattern, narration) for pattern in patterns):
                return category
        return None

    @staticmethod
    def _uncategorized(notes: Optional[str] = None) -> "TaxClassification":
        """Build the low-confidence result used when nothing could be determined."""
        return TaxClassification(
            tax_category="uncategorized",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.3,
            suggested_rule_ids=[],
            notes=notes
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def classify_transaction(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a transaction (from Mono API or manual entry)

        Accepts both formats:
        - Mono API: {"_id", "type": "credit/debit", "amount": 50000, "narration": "..."}
        - Backend: {"id", "type": "income/expense", "amount_kobo": 5000000, "description": "..."}

        Args:
            transaction: Transaction dict in either format. It is not mutated.

        Returns:
            A new dict containing every key of the input plus tax_category,
            tax_treatment, deductible, confidence, suggested_rule_ids and
            tax_notes. Input keys with those names are overwritten.
        """
        # Patterns are upper-case, so match against an upper-cased narration.
        narration = self._extract_narration(transaction).upper()
        amount = self._normalize_amount(transaction)
        tx_type = self._normalize_type(transaction)

        classification = self._classify_by_patterns(narration, tx_type, amount)

        # NOTE: the LLM fallback (_llm_classify) is deliberately not invoked
        # here to avoid rate limits; only pattern matching is used for now.

        return {
            **transaction,
            "tax_category": classification.tax_category,
            "tax_treatment": classification.tax_treatment,
            "deductible": classification.deductible,
            "confidence": classification.confidence,
            "suggested_rule_ids": classification.suggested_rule_ids,
            "tax_notes": classification.notes
        }

    def classify_batch(self, transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify multiple transactions, preserving input order."""
        return [self.classify_transaction(tx) for tx in transactions]

    # ------------------------------------------------------------------
    # Pattern-based classification
    # ------------------------------------------------------------------

    def _classify_by_patterns(
        self,
        narration: str,
        tx_type: str,
        amount: float
    ) -> "TaxClassification":
        """
        Pattern-based classification using regex

        Args:
            narration: Upper-cased narration text.
            tx_type: "credit" (checked against income patterns) or "debit"
                (checked against deduction patterns); any other value is
                uncategorized.
            amount: Transaction amount, forwarded to the category builders.
        """
        if tx_type == "credit":
            category = self._first_matching_category(narration, self.INCOME_PATTERNS)
            if category is not None:
                return self._get_income_classification(category, amount)
        elif tx_type == "debit":
            category = self._first_matching_category(narration, self.DEDUCTION_PATTERNS)
            if category is not None:
                return self._get_deduction_classification(category, amount)

        # Default: uncategorized
        return self._uncategorized(
            notes="Could not automatically categorize. Manual review recommended."
        )

    def _get_income_classification(self, category: str, amount: float) -> "TaxClassification":
        """
        Get classification for income categories

        Unknown categories fall back to a generic taxable "other_income"
        result. `amount` is accepted for signature compatibility and is
        currently unused.
        """
        spec = self._INCOME_CLASSIFICATIONS.get(category)
        if spec is None:
            return TaxClassification(
                tax_category="other_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.5,
                suggested_rule_ids=[]
            )

        return TaxClassification(
            tax_category=category,
            tax_treatment="taxable",
            deductible=False,
            confidence=spec["confidence"],
            # Copy so callers mutating the result cannot alter the shared table.
            suggested_rule_ids=list(spec["suggested_rule_ids"]),
            notes=spec["notes"]
        )

    def _get_deduction_classification(self, category: str, amount: float) -> "TaxClassification":
        """
        Get classification for deduction categories

        Unknown categories fall back to a non-deductible "other_expense"
        result. `amount` is accepted for signature compatibility and is
        currently unused.
        """
        spec = self._DEDUCTION_CLASSIFICATIONS.get(category)
        if spec is None:
            return TaxClassification(
                tax_category="other_expense",
                tax_treatment="unknown",
                deductible=False,
                confidence=0.4,
                suggested_rule_ids=[]
            )

        return TaxClassification(
            tax_category=category,
            tax_treatment=spec["tax_treatment"],
            deductible=spec["deductible"],
            confidence=spec["confidence"],
            # Copy so callers mutating the result cannot alter the shared table.
            suggested_rule_ids=list(spec["suggested_rule_ids"]),
            notes=spec["notes"]
        )

    # ------------------------------------------------------------------
    # LLM-based classification (fallback, currently not wired in)
    # ------------------------------------------------------------------

    def _llm_classify(self, transaction: Dict[str, Any]) -> "TaxClassification":
        """
        Use LLM/RAG to classify ambiguous transactions
        This is a fallback for transactions that don't match patterns

        The reply is expected to look like "employment_income, no, 0.95".
        Only categories present in INCOME_PATTERNS or DEDUCTION_PATTERNS are
        accepted; any other label, an unparseable reply, or a failing query
        yields the uncategorized result. Reported confidence is clamped to
        the range 0.0 .. 0.85.
        """
        if not self.rag:
            return self._uncategorized()

        # Accept both transaction formats, like classify_transaction does.
        narration = self._extract_narration(transaction)
        amount = self._normalize_amount(transaction)
        tx_type = transaction.get("type") or ""

        prompt = f"""
Classify this Nigerian bank transaction for tax purposes:

Transaction Details:
- Narration: {narration}
- Amount: ₦{amount:,.2f}
- Type: {tx_type}

Classify into ONE of these categories:
- employment_income (salary, wages, stipend)
- business_income (sales, revenue, client payments)
- rental_income (rent received from tenants)
- pension_contribution (PFA, RSA contributions)
- nhf_contribution (National Housing Fund)
- life_insurance (insurance premiums)
- rent_paid (rent paid to landlord)
- union_dues (professional fees, association dues)
- uncategorized (if unclear)

Also indicate:
1. Is it tax deductible? (yes/no)
2. Confidence level (0.0 to 1.0)

Respond with just the category name, deductible status, and confidence.
Example: "employment_income, no, 0.95"
"""

        try:
            # Query RAG pipeline
            response = self.rag.query(prompt, verbose=False)

            # The model may echo the quotes used in the prompt's example.
            cleaned = response.strip().strip('"\'`').lower()
            parts = [part.strip() for part in cleaned.split(',')]
            if len(parts) >= 3:
                category, deductible_text, confidence_text = parts[:3]
                confidence = float(confidence_text)

                # NaN is the only float that differs from itself; reject it.
                if confidence == confidence:
                    confidence = max(0.0, min(confidence, self._LLM_CONFIDENCE_CAP))

                    if category in self.INCOME_PATTERNS:
                        # Income is never a deduction, whatever the model says.
                        return TaxClassification(
                            tax_category=category,
                            tax_treatment="taxable",
                            deductible=False,
                            confidence=confidence,
                            suggested_rule_ids=[],
                            notes="Classified using AI analysis"
                        )

                    if category in self.DEDUCTION_PATTERNS:
                        deductible = 'yes' in deductible_text
                        return TaxClassification(
                            tax_category=category,
                            # A non-deductible expense is not "taxable".
                            tax_treatment="deductible" if deductible else "unknown",
                            deductible=deductible,
                            confidence=confidence,
                            suggested_rule_ids=[],
                            notes="Classified using AI analysis"
                        )
                    # "uncategorized" or an unrecognized label: fall through.
        except Exception as e:
            # Best effort: the pipeline is an opaque dependency, so any
            # failure (query or parsing) degrades to the fallback below.
            print(f"LLM classification failed: {e}")

        # Fallback
        return self._uncategorized()

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def get_classification_summary(self, classified_transactions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate summary statistics of classified transactions

        Args:
            classified_transactions: Dicts as returned by classify_transaction.
                A missing or empty tax_category counts as "uncategorized", a
                missing confidence as 0, and amounts are read from "amount"
                or, failing that, "amount_kobo" (converted by dividing by 100).

        Returns:
            Dict with total_transactions, categorized, uncategorized,
            high_confidence (confidence > 0.8), categorization_rate,
            transactions_by_category and amounts_by_category. For an empty
            input the same keys are returned with zero/empty values, plus the
            legacy "total" key.
        """
        total = len(classified_transactions)
        if total == 0:
            return {
                "total": 0,  # legacy key, kept for backward compatibility
                "total_transactions": 0,
                "categorized": 0,
                "uncategorized": 0,
                "high_confidence": 0,
                "categorization_rate": 0,
                "transactions_by_category": {},
                "amounts_by_category": {}
            }

        categorized = 0
        high_confidence = 0
        by_category: Dict[str, int] = {}
        amounts_by_category: Dict[str, float] = {}

        for tx in classified_transactions:
            cat = tx.get("tax_category") or "uncategorized"
            if cat != "uncategorized":
                categorized += 1
            if (tx.get("confidence") or 0) > 0.8:
                high_confidence += 1

            by_category[cat] = by_category.get(cat, 0) + 1
            amounts_by_category[cat] = (
                amounts_by_category.get(cat, 0) + self._normalize_amount(tx)
            )

        return {
            "total_transactions": total,
            "categorized": categorized,
            "uncategorized": total - categorized,
            "high_confidence": high_confidence,
            "categorization_rate": categorized / total,
            "transactions_by_category": by_category,
            "amounts_by_category": amounts_by_category
        }