# Kaanta / transaction_classifier.py
# Oluwaferanmi
# Update the classifier function
# 090df05
# transaction_classifier.py
"""
Transaction Classifier for Tax Optimization
Classifies Mono API and manual transactions into tax-relevant categories
"""
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional
@dataclass
class TaxClassification:
    """Result of classifying a transaction for tax purposes.

    Instances are produced by ``TransactionClassifier`` and copied field by
    field into the enriched transaction dict it returns.
    """

    # Category label, e.g. "employment_income", "rent_paid", "uncategorized".
    tax_category: str
    # How the amount is treated: "taxable", "deductible",
    # "potentially_deductible", "exempt" or "unknown".
    tax_treatment: str
    # True when the amount can be deducted under the current rules.
    deductible: bool
    # Classifier confidence score (the classifier emits values from 0.3 to 0.95).
    confidence: float
    # Identifiers of the tax rules suggested for this category (may be empty).
    suggested_rule_ids: List[str]
    # Optional human-readable explanation of the classification.
    notes: Optional[str] = None
class TransactionClassifier:
    """
    Classifies bank transactions (from Mono API or manual entry) into tax categories.

    Classification is keyword based: the upper-cased narration of a credit is
    matched against ``INCOME_PATTERNS`` and that of a debit against
    ``DEDUCTION_PATTERNS``. Categories are tried in declaration order and the
    first category with a matching pattern wins.
    """

    # Nigerian bank transaction patterns (expanded for better coverage)
    INCOME_PATTERNS = {
        'employment_income': [
            r'\bSALARY\b', r'\bWAGES\b', r'\bPAYROLL\b', r'\bSTIPEND\b',
            r'\bEMPLOYMENT\b', r'\bMONTHLY PAY\b', r'\bNET PAY\b',
            r'\bGROSS PAY\b', r'\bEARNINGS\b', r'\bSALARY PAYMENT\b',
        ],
        'business_income': [
            r'\bSALES\b', r'\bREVENUE\b', r'\bINVOICE\b', r'\bPAYMENT RECEIVED\b',
            r'\bCUSTOMER\b', r'\bCLIENT\b', r'\bPROJECT\b', r'\bCONSULTING\b',
            r'\bFREELANCE\b', r'\bCONTRACT\b',
        ],
        'rental_income': [
            r'\bRENT RECEIVED\b', r'\bTENANT\b', r'\bLEASE PAYMENT\b',
            r'\bPROPERTY INCOME\b', r'\bRENTAL\b',
        ],
        'investment_income': [
            r'\bDIVIDEND\b', r'\bINTEREST\b', r'\bINVESTMENT\b',
            r'\bCOUPON\b', r'\bBOND\b', r'\bSTOCK\b', r'\bSHARE\b',
        ],
    }

    DEDUCTION_PATTERNS = {
        'pension_contribution': [
            r'\bPENSION\b', r'\bPFA\b', r'\bRSA\b', r'\bRETIREMENT\b',
            r'\bPENSION FUND\b', r'\bPENSION CONTRIBUTION\b',
        ],
        'nhf_contribution': [
            r'\bNHF\b', r'\bHOUSING FUND\b', r'\bNATIONAL HOUSING\b',
        ],
        # health_insurance is declared before life_insurance on purpose: the
        # generic life pattern INSURANCE PREMIUM also matches narrations such
        # as "HEALTH INSURANCE PREMIUM", and the first matching category wins.
        'health_insurance': [
            r'\bHEALTH INSURANCE\b', r'\bHMO\b', r'\bMEDICAL INSURANCE\b',
            r'\bHEALTH PLAN\b',
        ],
        'life_insurance': [
            r'\bLIFE INSURANCE\b', r'\bLIFE ASSURANCE\b', r'\bINSURANCE PREMIUM\b',
            r'\bPOLICY PREMIUM\b',
        ],
        'rent_paid': [
            r'\bRENT\b', r'\bLANDLORD\b', r'\bLEASE\b', r'\bHOUSE RENT\b',
            r'\bAPARTMENT RENT\b',
        ],
        'union_dues': [
            r'\bUNION DUES\b', r'\bPROFESSIONAL FEES\b', r'\bASSOCIATION FEES\b',
            r'\bMEMBERSHIP DUES\b',
        ],
    }

    # Category label used whenever nothing can be determined.
    _UNCATEGORIZED = "uncategorized"

    # Upper bound applied to any confidence reported by the LLM.
    _LLM_CONFIDENCE_CAP = 0.85

    # Prompt sent to the RAG pipeline; filled in with str.format().
    _LLM_PROMPT_TEMPLATE = (
        "\n"
        "Classify this Nigerian bank transaction for tax purposes:\n"
        "Transaction Details:\n"
        "- Narration: {narration}\n"
        "- Amount: ₦{amount:,.2f}\n"
        "- Type: {tx_type}\n"
        "Classify into ONE of these categories:\n"
        "- employment_income (salary, wages, stipend)\n"
        "- business_income (sales, revenue, client payments)\n"
        "- rental_income (rent received from tenants)\n"
        "- investment_income (dividends, interest)\n"
        "- pension_contribution (PFA, RSA contributions)\n"
        "- nhf_contribution (National Housing Fund)\n"
        "- life_insurance (insurance premiums)\n"
        "- health_insurance (HMO, medical insurance)\n"
        "- rent_paid (rent paid to landlord)\n"
        "- union_dues (professional fees, association dues)\n"
        "- uncategorized (if unclear)\n"
        "Also indicate:\n"
        "1. Is it tax deductible? (yes/no)\n"
        "2. Confidence level (0.0 to 1.0)\n"
        "Respond with just the category name, deductible status, and confidence.\n"
        "Example: \"employment_income, no, 0.95\"\n"
    )

    def __init__(self, rag_pipeline: Optional[Any] = None):
        """
        Initialize classifier

        Args:
            rag_pipeline: Optional RAG pipeline for LLM-based classification
                of ambiguous transactions. Only ``_llm_classify`` uses it, and
                it is called as ``rag_pipeline.query(prompt, verbose=False)``.
        """
        self.rag = rag_pipeline

    # ------------------------------------------------------------------
    # Input normalization helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize_narration(transaction: Dict[str, Any]) -> str:
        """Return the upper-cased narration/description ('' when absent)."""
        text = transaction.get("narration") or transaction.get("description") or ""
        return str(text).upper()

    @staticmethod
    def _normalize_amount(transaction: Dict[str, Any]) -> float:
        """Return the absolute amount in naira.

        ``amount`` is used as-is when present; otherwise ``amount_kobo`` is
        divided by 100. Missing or ``None`` values count as 0.0. Values that
        cannot be converted to ``float`` still raise, as before.
        """
        amount = transaction.get("amount")
        if amount is not None:
            return abs(float(amount))
        amount_kobo = transaction.get("amount_kobo")
        if amount_kobo is None:
            return 0.0
        return abs(float(amount_kobo) / 100.0)

    @staticmethod
    def _normalize_type(transaction: Dict[str, Any]) -> str:
        """Return 'credit'/'debit' for both credit/debit and income/expense inputs.

        Any other value is returned lower-cased and stripped; a missing or
        ``None`` type yields ''.
        """
        tx_type = str(transaction.get("type") or "").strip().lower()
        if tx_type == "income":
            return "credit"
        if tx_type == "expense":
            return "debit"
        return tx_type

    @classmethod
    def _uncategorized(cls, notes: Optional[str] = None) -> TaxClassification:
        """Build the low-confidence result used when nothing matched."""
        return TaxClassification(
            tax_category=cls._UNCATEGORIZED,
            tax_treatment="unknown",
            deductible=False,
            confidence=0.3,
            suggested_rule_ids=[],
            notes=notes,
        )

    @staticmethod
    def _first_matching_category(
        patterns_by_category: Dict[str, List[str]],
        narration: str,
    ) -> Optional[str]:
        """Return the first category with a pattern found in *narration*, else None."""
        for category, patterns in patterns_by_category.items():
            if any(re.search(pattern, narration) for pattern in patterns):
                return category
        return None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def classify_transaction(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a transaction (from Mono API or manual entry)

        Accepts both formats:
        - Mono API: {"_id", "type": "credit/debit", "amount": 50000, "narration": "..."}
        - Backend: {"id", "type": "income/expense", "amount_kobo": 5000000, "description": "..."}

        Returns:
            A new dict holding every key of *transaction* plus
            ``tax_category``, ``tax_treatment``, ``deductible``,
            ``confidence``, ``suggested_rule_ids`` and ``tax_notes``.
            The input dict is not modified.
        """
        narration = self._normalize_narration(transaction)
        amount = self._normalize_amount(transaction)
        tx_type = self._normalize_type(transaction)

        classification = self._classify_by_patterns(narration, tx_type, amount)

        # NOTE: the LLM fallback is deliberately disabled to avoid rate limits,
        # so only pattern matching is used for now. To re-enable it, call
        # self._llm_classify(transaction) when classification.confidence < 0.7
        # and self.rag is set, and keep whichever result is more confident.

        return {
            **transaction,
            "tax_category": classification.tax_category,
            "tax_treatment": classification.tax_treatment,
            "deductible": classification.deductible,
            "confidence": classification.confidence,
            "suggested_rule_ids": classification.suggested_rule_ids,
            "tax_notes": classification.notes,
        }

    def classify_batch(self, transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify multiple transactions, preserving their order."""
        return [self.classify_transaction(tx) for tx in transactions]

    # ------------------------------------------------------------------
    # Pattern-based classification
    # ------------------------------------------------------------------
    def _classify_by_patterns(
        self,
        narration: str,
        tx_type: str,
        amount: float
    ) -> TaxClassification:
        """Pattern-based classification using regex.

        Args:
            narration: Upper-cased narration (the patterns are upper-case).
            tx_type: 'credit' selects the income patterns, 'debit' the
                deduction patterns; anything else is uncategorized.
            amount: Passed through to the category lookups.
        """
        if tx_type == "credit":
            category = self._first_matching_category(self.INCOME_PATTERNS, narration)
            if category is not None:
                return self._get_income_classification(category, amount)
        elif tx_type == "debit":
            category = self._first_matching_category(self.DEDUCTION_PATTERNS, narration)
            if category is not None:
                return self._get_deduction_classification(category, amount)

        return self._uncategorized(
            notes="Could not automatically categorize. Manual review recommended."
        )

    def _get_income_classification(self, category: str, amount: float) -> TaxClassification:
        """Get classification for income categories.

        Unknown categories fall back to a generic taxable "other_income"
        result. *amount* is accepted for interface symmetry and is not used.
        """
        classifications = {
            'employment_income': TaxClassification(
                tax_category="employment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.95,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Employment income is fully taxable under PITA"
            ),
            'business_income': TaxClassification(
                tax_category="business_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=["cit.rate.small_2025", "cit.rate.medium_2025", "cit.rate.large_2025"],
                notes="Business income subject to CIT or PIT depending on structure"
            ),
            'rental_income': TaxClassification(
                tax_category="rental_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.90,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Rental income is taxable. Consider property expenses as deductions."
            ),
            'investment_income': TaxClassification(
                tax_category="investment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=[],
                notes="Investment income may be subject to withholding tax"
            ),
        }
        return classifications.get(category, TaxClassification(
            tax_category="other_income",
            tax_treatment="taxable",
            deductible=False,
            confidence=0.5,
            suggested_rule_ids=[]
        ))

    def _get_deduction_classification(self, category: str, amount: float) -> TaxClassification:
        """Get classification for deduction categories.

        Unknown categories fall back to a generic non-deductible
        "other_expense" result. *amount* is accepted for interface symmetry
        and is not used.
        """
        classifications = {
            'pension_contribution': TaxClassification(
                tax_category="pension_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.deduction.pension"],
                notes="Pension contributions to PRA-approved schemes are tax deductible (PITA s.20(1)(g))"
            ),
            'nhf_contribution': TaxClassification(
                tax_category="nhf_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="NHF contributions are tax deductible (2.5% of basic salary)"
            ),
            'life_insurance': TaxClassification(
                tax_category="life_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.85,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Life insurance premiums are tax deductible if policy is with licensed insurer"
            ),
            'health_insurance': TaxClassification(
                tax_category="health_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Health insurance premiums may be tax deductible"
            ),
            'rent_paid': TaxClassification(
                tax_category="rent_paid",
                tax_treatment="potentially_deductible",
                deductible=False,  # Not in 2025, but yes in 2026
                confidence=0.85,
                suggested_rule_ids=["pit.relief.rent_2026"],
                notes="Rent paid: Not deductible in 2025. From 2026, 20% of rent (max ₦500K) under NTA 2025"
            ),
            'union_dues': TaxClassification(
                tax_category="union_dues",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Professional association fees and union dues are tax deductible"
            ),
        }
        return classifications.get(category, TaxClassification(
            tax_category="other_expense",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.4,
            suggested_rule_ids=[]
        ))

    # ------------------------------------------------------------------
    # LLM-based classification (currently not called by classify_transaction)
    # ------------------------------------------------------------------
    def _llm_classify(self, transaction: Dict[str, Any]) -> TaxClassification:
        """
        Use LLM/RAG to classify ambiguous transactions

        This is a fallback for transactions that don't match patterns. The
        model is expected to answer "<category>, <yes|no>, <confidence>".

        Only category labels known to this classifier are accepted. Their tax
        treatment, deductibility and rule ids come from the same tables used
        by pattern matching, so the model's own yes/no answer is ignored and
        an AI result can never contradict a pattern result for the same
        category. The reported confidence is clamped to [0, 0.85].

        Returns the uncategorized result (confidence 0.3) when no pipeline is
        configured, the answer cannot be parsed, the label is unknown, or the
        pipeline raises.
        """
        if not self.rag:
            return self._uncategorized()

        try:
            amount = self._normalize_amount(transaction)
            prompt = self._LLM_PROMPT_TEMPLATE.format(
                narration=transaction.get("narration") or transaction.get("description") or "",
                amount=amount,
                tx_type=self._normalize_type(transaction),
            )
            response = self.rag.query(prompt, verbose=False)

            # Models often echo the quotes from the example answer; drop them.
            parts = [part.strip().strip("\"'") for part in str(response).lower().split(',')]
            if len(parts) >= 3:
                category = parts[0]
                confidence = float(parts[2])

                if category in self.INCOME_PATTERNS:
                    base = self._get_income_classification(category, amount)
                elif category in self.DEDUCTION_PATTERNS:
                    base = self._get_deduction_classification(category, amount)
                else:
                    base = None  # "uncategorized" or a label we do not know

                # confidence != confidence is only true for NaN.
                if base is not None and confidence == confidence:
                    return TaxClassification(
                        tax_category=base.tax_category,
                        tax_treatment=base.tax_treatment,
                        deductible=base.deductible,
                        confidence=min(max(confidence, 0.0), self._LLM_CONFIDENCE_CAP),
                        suggested_rule_ids=list(base.suggested_rule_ids),
                        notes="Classified using AI analysis"
                    )
        except Exception as exc:
            # The pipeline is an opaque external dependency: whatever it (or
            # parsing its answer) raises must not break classification.
            logging.getLogger(__name__).warning("LLM classification failed: %s", exc)

        return self._uncategorized()

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def get_classification_summary(self, classified_transactions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate summary statistics of classified transactions.

        Args:
            classified_transactions: Dicts as returned by
                ``classify_transaction``. A missing or ``None``
                ``tax_category`` counts as "uncategorized"; amounts are read
                from ``amount`` or, failing that, ``amount_kobo`` / 100.

        Returns:
            A dict with ``total_transactions``, ``categorized``,
            ``uncategorized``, ``high_confidence`` (confidence > 0.8),
            ``categorization_rate``, ``transactions_by_category`` and
            ``amounts_by_category``. For an empty input every figure is zero
            and the legacy ``total`` key is also included.
        """
        total = len(classified_transactions)
        by_category: Dict[str, int] = {}
        amounts_by_category: Dict[str, float] = {}
        high_confidence = 0

        for tx in classified_transactions:
            category = tx.get("tax_category") or self._UNCATEGORIZED
            by_category[category] = by_category.get(category, 0) + 1
            amounts_by_category[category] = (
                amounts_by_category.get(category, 0) + self._normalize_amount(tx)
            )
            if (tx.get("confidence") or 0) > 0.8:
                high_confidence += 1

        # Derived from the grouping so both figures always agree.
        categorized = total - by_category.get(self._UNCATEGORIZED, 0)

        summary: Dict[str, Any] = {
            "total_transactions": total,
            "categorized": categorized,
            "uncategorized": total - categorized,
            "high_confidence": high_confidence,
            "categorization_rate": categorized / total if total else 0,
            "transactions_by_category": by_category,
            "amounts_by_category": amounts_by_category,
        }
        if total == 0:
            # Older callers read "total" from the empty-input result.
            summary["total"] = 0
        return summary