|
|
|
|
|
""" |
|
|
Transaction Classifier for Tax Optimization |
|
|
Classifies Mono API and manual transactions into tax-relevant categories |
|
|
""" |
|
|
from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
|
|
|
|
|
|
|
|
@dataclass
class TaxClassification:
    """Result of classifying a transaction for tax purposes."""

    # Category label, e.g. "employment_income", "rent_paid", "uncategorized".
    tax_category: str
    # Treatment label used by the classifier: "taxable", "deductible",
    # "potentially_deductible" or "unknown".
    tax_treatment: str
    # True when the transaction is flagged as deductible.
    deductible: bool
    # Classifier confidence score as a float.
    confidence: float
    # Identifiers of tax rules suggested for this classification. Defaults to a
    # fresh empty list per instance so callers may omit it safely.
    suggested_rule_ids: List[str] = field(default_factory=list)
    # Optional human-readable explanation of the classification.
    notes: Optional[str] = None
|
|
|
|
|
|
|
|
class TransactionClassifier:
    """
    Classifies bank transactions (from Mono API or manual entry) into tax categories.

    Classification is keyword based: the transaction narration is matched
    against ``INCOME_PATTERNS`` (for credits) or ``DEDUCTION_PATTERNS`` (for
    debits). ``_llm_classify`` offers an optional RAG/LLM-based fallback, but
    ``classify_transaction`` does not invoke it.
    """

    # Narration keywords for incoming money, keyed by tax category.
    # Categories are tried in insertion order and the first match wins.
    INCOME_PATTERNS = {
        'employment_income': [
            r'\bSALARY\b', r'\bWAGES\b', r'\bPAYROLL\b', r'\bSTIPEND\b',
            r'\bEMPLOYMENT\b', r'\bMONTHLY PAY\b', r'\bNET PAY\b',
            r'\bGROSS PAY\b', r'\bEARNINGS\b', r'\bSALARY PAYMENT\b'
        ],
        'business_income': [
            r'\bSALES\b', r'\bREVENUE\b', r'\bINVOICE\b', r'\bPAYMENT RECEIVED\b',
            r'\bCUSTOMER\b', r'\bCLIENT\b', r'\bPROJECT\b', r'\bCONSULTING\b',
            r'\bFREELANCE\b', r'\bCONTRACT\b'
        ],
        'rental_income': [
            r'\bRENT RECEIVED\b', r'\bTENANT\b', r'\bLEASE PAYMENT\b',
            r'\bPROPERTY INCOME\b', r'\bRENTAL\b'
        ],
        'investment_income': [
            r'\bDIVIDEND\b', r'\bINTEREST\b', r'\bINVESTMENT\b',
            r'\bCOUPON\b', r'\bBOND\b', r'\bSTOCK\b', r'\bSHARE\b'
        ]
    }

    # Narration keywords for outgoing money, keyed by tax category.
    # Order matters (first match wins): 'health_insurance' is listed before
    # 'life_insurance' because the generic INSURANCE PREMIUM pattern would
    # otherwise capture narrations such as "HEALTH INSURANCE PREMIUM".
    DEDUCTION_PATTERNS = {
        'pension_contribution': [
            r'\bPENSION\b', r'\bPFA\b', r'\bRSA\b', r'\bRETIREMENT\b',
            r'\bPENSION FUND\b', r'\bPENSION CONTRIBUTION\b'
        ],
        'nhf_contribution': [
            r'\bNHF\b', r'\bHOUSING FUND\b', r'\bNATIONAL HOUSING\b'
        ],
        'health_insurance': [
            r'\bHEALTH INSURANCE\b', r'\bHMO\b', r'\bMEDICAL INSURANCE\b',
            r'\bHEALTH PLAN\b'
        ],
        'life_insurance': [
            r'\bLIFE INSURANCE\b', r'\bLIFE ASSURANCE\b', r'\bINSURANCE PREMIUM\b',
            r'\bPOLICY PREMIUM\b'
        ],
        'rent_paid': [
            r'\bRENT\b', r'\bLANDLORD\b', r'\bLEASE\b', r'\bHOUSE RENT\b',
            r'\bAPARTMENT RENT\b'
        ],
        'union_dues': [
            r'\bUNION DUES\b', r'\bPROFESSIONAL FEES\b', r'\bASSOCIATION FEES\b',
            r'\bMEMBERSHIP DUES\b'
        ]
    }

    # Backend transaction types mapped onto the Mono API credit/debit vocabulary.
    _TYPE_ALIASES = {'income': 'credit', 'expense': 'debit'}

    def __init__(self, rag_pipeline: Optional[Any] = None):
        """
        Initialize classifier

        Args:
            rag_pipeline: Optional RAG pipeline for LLM-based classification of
                ambiguous transactions. Only ``_llm_classify`` uses it, via
                ``rag_pipeline.query(prompt, verbose=False)``.
        """
        self.rag = rag_pipeline

    @staticmethod
    def _extract_amount(transaction: Dict[str, Any]) -> float:
        """Return the absolute transaction amount, reading ``amount`` or ``amount_kobo`` / 100."""
        amount = transaction.get("amount")
        if amount is None:
            # Backend format: the value is stored as "amount_kobo" and divided
            # by 100 here. A missing or None value counts as 0.
            return abs(float(transaction.get("amount_kobo") or 0) / 100.0)
        return abs(float(amount))

    @staticmethod
    def _match_category(narration: str, pattern_table: Dict[str, List[str]]) -> Optional[str]:
        """Return the first category in ``pattern_table`` with a pattern matching ``narration``."""
        for category, patterns in pattern_table.items():
            if any(re.search(pattern, narration, re.IGNORECASE) for pattern in patterns):
                return category
        return None

    @staticmethod
    def _uncategorized(notes: Optional[str] = None) -> "TaxClassification":
        """Build the low-confidence result used when no category could be determined."""
        return TaxClassification(
            tax_category="uncategorized",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.3,
            suggested_rule_ids=[],
            notes=notes
        )

    def classify_transaction(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a transaction (from Mono API or manual entry)

        Accepts both formats:
        - Mono API: {"_id", "type": "credit/debit", "amount": 50000, "narration": "..."}
        - Backend: {"id", "type": "income/expense", "amount_kobo": 5000000, "description": "..."}

        Missing or ``None`` values for the narration, type and amount fields
        are tolerated and treated as empty / zero.

        Args:
            transaction: Transaction record in either of the formats above.

        Returns:
            A new dict holding every key of ``transaction`` plus
            ``tax_category``, ``tax_treatment``, ``deductible``,
            ``confidence``, ``suggested_rule_ids`` and ``tax_notes``.

        Raises:
            ValueError: If the amount is present but cannot be converted to float.
        """
        raw_narration = transaction.get("narration") or transaction.get("description") or ""
        narration = str(raw_narration).upper()

        amount = self._extract_amount(transaction)

        # "type" may be absent or explicitly None; both become "" (no match).
        tx_type = str(transaction.get("type") or "").strip().lower()
        tx_type = self._TYPE_ALIASES.get(tx_type, tx_type)

        classification = self._classify_by_patterns(narration, tx_type, amount)

        return {
            **transaction,
            "tax_category": classification.tax_category,
            "tax_treatment": classification.tax_treatment,
            "deductible": classification.deductible,
            "confidence": classification.confidence,
            "suggested_rule_ids": classification.suggested_rule_ids,
            "tax_notes": classification.notes
        }

    def classify_batch(self, transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify multiple transactions, preserving input order."""
        return [self.classify_transaction(tx) for tx in transactions]

    def _classify_by_patterns(
        self,
        narration: str,
        tx_type: str,
        amount: float
    ) -> "TaxClassification":
        """
        Pattern-based classification using regex

        Args:
            narration: Transaction narration text (matched case-insensitively).
            tx_type: "credit" selects the income patterns, "debit" the
                deduction patterns; any other value matches nothing.
            amount: Transaction amount, forwarded to the category builders.

        Returns:
            The classification of the first matching category, or an
            "uncategorized" result when nothing matches.
        """
        if tx_type == "credit":
            category = self._match_category(narration, self.INCOME_PATTERNS)
            if category is not None:
                return self._get_income_classification(category, amount)
        elif tx_type == "debit":
            category = self._match_category(narration, self.DEDUCTION_PATTERNS)
            if category is not None:
                return self._get_deduction_classification(category, amount)

        return self._uncategorized(
            notes="Could not automatically categorize. Manual review recommended."
        )

    def _get_income_classification(self, category: str, amount: float) -> "TaxClassification":
        """
        Get classification for income categories

        Args:
            category: Income category key (see ``INCOME_PATTERNS``).
            amount: Transaction amount; accepted for interface symmetry and
                not used in the lookup.

        Returns:
            The predefined classification for ``category``, or a generic
            "other_income" result for unknown categories.
        """
        classifications = {
            'employment_income': TaxClassification(
                tax_category="employment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.95,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Employment income is fully taxable under PITA"
            ),
            'business_income': TaxClassification(
                tax_category="business_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=["cit.rate.small_2025", "cit.rate.medium_2025", "cit.rate.large_2025"],
                notes="Business income subject to CIT or PIT depending on structure"
            ),
            'rental_income': TaxClassification(
                tax_category="rental_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.90,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Rental income is taxable. Consider property expenses as deductions."
            ),
            'investment_income': TaxClassification(
                tax_category="investment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=[],
                notes="Investment income may be subject to withholding tax"
            )
        }

        known = classifications.get(category)
        if known is not None:
            return known
        return TaxClassification(
            tax_category="other_income",
            tax_treatment="taxable",
            deductible=False,
            confidence=0.5,
            suggested_rule_ids=[]
        )

    def _get_deduction_classification(self, category: str, amount: float) -> "TaxClassification":
        """
        Get classification for deduction categories

        Args:
            category: Deduction category key (see ``DEDUCTION_PATTERNS``).
            amount: Transaction amount; accepted for interface symmetry and
                not used in the lookup.

        Returns:
            The predefined classification for ``category``, or a generic
            "other_expense" result for unknown categories.
        """
        classifications = {
            'pension_contribution': TaxClassification(
                tax_category="pension_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.deduction.pension"],
                notes="Pension contributions to PRA-approved schemes are tax deductible (PITA s.20(1)(g))"
            ),
            'nhf_contribution': TaxClassification(
                tax_category="nhf_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="NHF contributions are tax deductible (2.5% of basic salary)"
            ),
            'life_insurance': TaxClassification(
                tax_category="life_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.85,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Life insurance premiums are tax deductible if policy is with licensed insurer"
            ),
            'health_insurance': TaxClassification(
                tax_category="health_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Health insurance premiums may be tax deductible"
            ),
            'rent_paid': TaxClassification(
                tax_category="rent_paid",
                tax_treatment="potentially_deductible",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=["pit.relief.rent_2026"],
                notes="Rent paid: Not deductible in 2025. From 2026, 20% of rent (max ₦500K) under NTA 2025"
            ),
            'union_dues': TaxClassification(
                tax_category="union_dues",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Professional association fees and union dues are tax deductible"
            )
        }

        known = classifications.get(category)
        if known is not None:
            return known
        return TaxClassification(
            tax_category="other_expense",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.4,
            suggested_rule_ids=[]
        )

    def _parse_llm_response(self, response: Any) -> Optional["TaxClassification"]:
        """
        Turn a "category, yes/no, confidence" reply into a classification.

        Returns ``None`` when the reply has fewer than three comma-separated
        parts, names a category that is not a key of ``INCOME_PATTERNS`` or
        ``DEDUCTION_PATTERNS`` (this includes "uncategorized"), or reports a
        confidence outside 0.0-1.0.

        Raises:
            ValueError: If the confidence field is not a number.
        """
        parts = [part.strip().strip("\"'") for part in str(response).lower().split(',')]
        if len(parts) < 3:
            return None

        category = parts[0]
        known_categories = set(self.INCOME_PATTERNS) | set(self.DEDUCTION_PATTERNS)
        if category not in known_categories:
            return None

        deductible = 'yes' in parts[1]
        # Tolerate a sentence-ending period after the number, e.g. "0.95.".
        confidence = float(parts[2].rstrip('.'))
        # The range comparison also rejects NaN.
        if not 0.0 <= confidence <= 1.0:
            return None

        return TaxClassification(
            tax_category=category,
            tax_treatment="deductible" if deductible else "taxable",
            deductible=deductible,
            # Model-reported confidence is capped at 0.85.
            confidence=min(confidence, 0.85),
            suggested_rule_ids=[],
            notes="Classified using AI analysis"
        )

    def _llm_classify(self, transaction: Dict[str, Any]) -> "TaxClassification":
        """
        Use LLM/RAG to classify ambiguous transactions
        This is a fallback for transactions that don't match patterns

        Args:
            transaction: Transaction record in Mono API or backend format.

        Returns:
            The classification parsed from the model reply, or an
            "uncategorized" result when no RAG pipeline is configured, the
            query fails, or the reply cannot be parsed or validated.
        """
        if not self.rag:
            return self._uncategorized()

        narration = transaction.get("narration") or transaction.get("description") or ""
        tx_type = transaction.get("type") or ""

        # Best-effort fallback: the pipeline is opaque to this class, so any
        # failure (bad amount, query error, unparsable reply) is logged and
        # degrades to "uncategorized" instead of propagating.
        try:
            amount = self._extract_amount(transaction)

            prompt = f"""
Classify this Nigerian bank transaction for tax purposes:

Transaction Details:
- Narration: {narration}
- Amount: ₦{amount:,.2f}
- Type: {tx_type}

Classify into ONE of these categories:
- employment_income (salary, wages, stipend)
- business_income (sales, revenue, client payments)
- rental_income (rent received from tenants)
- pension_contribution (PFA, RSA contributions)
- nhf_contribution (National Housing Fund)
- life_insurance (insurance premiums)
- rent_paid (rent paid to landlord)
- union_dues (professional fees, association dues)
- uncategorized (if unclear)

Also indicate:
1. Is it tax deductible? (yes/no)
2. Confidence level (0.0 to 1.0)

Respond with just the category name, deductible status, and confidence.
Example: "employment_income, no, 0.95"
"""

            response = self.rag.query(prompt, verbose=False)
            parsed = self._parse_llm_response(response)
        except Exception as exc:
            logging.getLogger(__name__).warning("LLM classification failed: %s", exc)
            parsed = None

        if parsed is not None:
            return parsed
        return self._uncategorized()

    def get_classification_summary(self, classified_transactions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate summary statistics of classified transactions

        Args:
            classified_transactions: Records as returned by
                ``classify_transaction``. A record without ``tax_category``
                is counted as "uncategorized"; the amount is read from
                ``amount`` or, failing that, ``amount_kobo`` / 100.

        Returns:
            Dict with ``total_transactions``, ``categorized``,
            ``uncategorized``, ``high_confidence`` (confidence > 0.8),
            ``categorization_rate``, ``transactions_by_category`` and
            ``amounts_by_category``. For empty input the same keys are
            returned with zero/empty values, plus the legacy ``total`` key.
        """
        total = len(classified_transactions)
        if total == 0:
            return {
                # Legacy key kept for callers of the old empty-input shape.
                "total": 0,
                "total_transactions": 0,
                "categorized": 0,
                "uncategorized": 0,
                "high_confidence": 0,
                "categorization_rate": 0,
                "transactions_by_category": {},
                "amounts_by_category": {}
            }

        by_category: Dict[str, int] = {}
        amounts_by_category: Dict[str, float] = {}
        high_confidence = 0

        for tx in classified_transactions:
            cat = tx.get("tax_category") or "uncategorized"
            by_category[cat] = by_category.get(cat, 0) + 1
            amounts_by_category[cat] = amounts_by_category.get(cat, 0) + self._extract_amount(tx)
            if (tx.get("confidence") or 0) > 0.8:
                high_confidence += 1

        # Derived from the same buckets so the counts cannot disagree.
        categorized = total - by_category.get("uncategorized", 0)

        return {
            "total_transactions": total,
            "categorized": categorized,
            "uncategorized": total - categorized,
            "high_confidence": high_confidence,
            "categorization_rate": categorized / total,
            "transactions_by_category": by_category,
            "amounts_by_category": amounts_by_category
        }
|
|
|