Spaces:

Eniiyanu
/

Kaanta

Running

Kaanta / transaction_classifier.py

Oluwaferanmi

Update the classifier function

090df05 2 months ago

15.3 kB

	# transaction_classifier.py
	"""
	Transaction Classifier for Tax Optimization
	Classifies Mono API and manual transactions into tax-relevant categories
	"""
	from __future__ import annotations
	from typing import Dict, List, Any, Optional
	import re
	from dataclasses import dataclass
	from datetime import datetime


	@dataclass
	class TaxClassification:
	    """Outcome of classifying one transaction for tax purposes.

	    Fields are declared in constructor (positional) order; only ``notes``
	    has a default.
	    """

	    # Category label, e.g. "employment_income" or "uncategorized".
	    tax_category: str
	    # Treatment label, e.g. taxable, deductible, exempt, unknown.
	    tax_treatment: str
	    # Whether the amount counts as a deduction.
	    deductible: bool
	    # Confidence score assigned by the classifier.
	    confidence: float
	    # Identifiers of the rules suggested for this category (may be empty).
	    suggested_rule_ids: List[str]
	    # Optional free-text explanation for the user.
	    notes: Optional[str] = None


	class TransactionClassifier:
	    """
	    Classifies bank transactions (from Mono API or manual entry) into tax categories.

	    Classification is regex driven: ``INCOME_PATTERNS`` is consulted for
	    credits and ``DEDUCTION_PATTERNS`` for debits. Categories are tried in
	    dictionary order and the first category with a matching pattern wins.
	    Narrations are upper-cased before matching, so patterns are written in
	    upper case.
	    """

	    # Nigerian bank transaction patterns (expanded for better coverage)
	    INCOME_PATTERNS = {
	        'employment_income': [
	            r'\bSALARY\b', r'\bWAGES\b', r'\bPAYROLL\b', r'\bSTIPEND\b',
	            r'\bEMPLOYMENT\b', r'\bMONTHLY PAY\b', r'\bNET PAY\b',
	            r'\bGROSS PAY\b', r'\bEARNINGS\b', r'\bSALARY PAYMENT\b'
	        ],
	        'business_income': [
	            r'\bSALES\b', r'\bREVENUE\b', r'\bINVOICE\b', r'\bPAYMENT RECEIVED\b',
	            r'\bCUSTOMER\b', r'\bCLIENT\b', r'\bPROJECT\b', r'\bCONSULTING\b',
	            r'\bFREELANCE\b', r'\bCONTRACT\b'
	        ],
	        'rental_income': [
	            r'\bRENT RECEIVED\b', r'\bTENANT\b', r'\bLEASE PAYMENT\b',
	            r'\bPROPERTY INCOME\b', r'\bRENTAL\b'
	        ],
	        'investment_income': [
	            r'\bDIVIDEND\b', r'\bINTEREST\b', r'\bINVESTMENT\b',
	            r'\bCOUPON\b', r'\bBOND\b', r'\bSTOCK\b', r'\bSHARE\b'
	        ]
	    }

	    DEDUCTION_PATTERNS = {
	        'pension_contribution': [
	            r'\bPENSION\b', r'\bPFA\b', r'\bRSA\b', r'\bRETIREMENT\b',
	            r'\bPENSION FUND\b', r'\bPENSION CONTRIBUTION\b'
	        ],
	        'nhf_contribution': [
	            r'\bNHF\b', r'\bHOUSING FUND\b', r'\bNATIONAL HOUSING\b'
	        ],
	        'life_insurance': [
	            r'\bLIFE INSURANCE\b', r'\bLIFE ASSURANCE\b', r'\bINSURANCE PREMIUM\b',
	            r'\bPOLICY PREMIUM\b'
	        ],
	        'health_insurance': [
	            r'\bHEALTH INSURANCE\b', r'\bHMO\b', r'\bMEDICAL INSURANCE\b',
	            r'\bHEALTH PLAN\b'
	        ],
	        'rent_paid': [
	            r'\bRENT\b', r'\bLANDLORD\b', r'\bLEASE\b', r'\bHOUSE RENT\b',
	            r'\bAPARTMENT RENT\b'
	        ],
	        'union_dues': [
	            r'\bUNION DUES\b', r'\bPROFESSIONAL FEES\b', r'\bASSOCIATION FEES\b',
	            r'\bMEMBERSHIP DUES\b'
	        ]
	    }

	    # Backend transaction types mapped onto the credit/debit vocabulary.
	    _TYPE_ALIASES = {'income': 'credit', 'expense': 'debit'}

	    def __init__(self, rag_pipeline: Optional[Any] = None):
	        """
	        Initialize classifier

	        Args:
	            rag_pipeline: Optional RAG pipeline for LLM-based classification of
	                ambiguous transactions. It is only used by ``_llm_classify``.
	        """
	        self.rag = rag_pipeline

	    @staticmethod
	    def _normalize_amount(transaction: Dict[str, Any]) -> float:
	        """Return the absolute transaction amount as a float.

	        Uses ``amount`` when it is present and not None; otherwise falls back
	        to ``amount_kobo`` divided by 100. A missing or None ``amount_kobo``
	        counts as 0.

	        Raises:
	            ValueError, TypeError: if the chosen value cannot be converted
	                with ``float()``.
	        """
	        amount = transaction.get("amount")
	        if amount is None:
	            amount = float(transaction.get("amount_kobo") or 0) / 100.0
	        return abs(float(amount))

	    @classmethod
	    def _normalize_type(cls, raw_type: Any) -> str:
	        """Lower-case a transaction type and map income/expense to credit/debit.

	        None (or any other falsy value) becomes the empty string instead of
	        raising, so a transaction without a type simply ends up uncategorized.
	        """
	        tx_type = str(raw_type or "").strip().lower()
	        return cls._TYPE_ALIASES.get(tx_type, tx_type)

	    @staticmethod
	    def _uncategorized(notes: Optional[str] = None) -> "TaxClassification":
	        """Build the default low-confidence 'uncategorized' classification."""
	        return TaxClassification(
	            tax_category="uncategorized",
	            tax_treatment="unknown",
	            deductible=False,
	            confidence=0.3,
	            suggested_rule_ids=[],
	            notes=notes
	        )

	    @staticmethod
	    def _first_matching_category(
	        pattern_table: Dict[str, List[str]],
	        narration: str
	    ) -> Optional[str]:
	        """Return the first category (in table order) with a matching pattern, or None."""
	        for category, patterns in pattern_table.items():
	            if any(re.search(pattern, narration) for pattern in patterns):
	                return category
	        return None

	    def classify_transaction(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
	        """
	        Classify a transaction (from Mono API or manual entry)

	        Accepts both formats:
	        - Mono API: {"_id", "type": "credit/debit", "amount": 50000, "narration": "..."}
	        - Backend: {"id", "type": "income/expense", "amount_kobo": 5000000, "description": "..."}

	        Args:
	            transaction: The raw transaction dict. It is not modified.

	        Returns:
	            A new dict holding every original key plus ``tax_category``,
	            ``tax_treatment``, ``deductible``, ``confidence``,
	            ``suggested_rule_ids`` and ``tax_notes``.

	        Raises:
	            ValueError, TypeError: if the amount is not numeric.
	        """
	        # Normalize narration/description; str() guards against non-string values.
	        narration = str(
	            transaction.get("narration") or transaction.get("description") or ""
	        ).upper()

	        # Normalize amount (handle both kobo and naira)
	        amount = self._normalize_amount(transaction)

	        # Normalize type (handle both credit/debit and income/expense)
	        tx_type = self._normalize_type(transaction.get("type"))

	        # Classify using pattern matching
	        classification = self._classify_by_patterns(narration, tx_type, amount)

	        # NOTE: the LLM fallback (_llm_classify) is intentionally not called here
	        # to avoid rate limits; only pattern matching is used for now. The
	        # intended rule when re-enabling it: consult the LLM only when the
	        # pattern confidence is below 0.7 and self.rag is set, and keep the LLM
	        # result only if its confidence is higher than the pattern result.

	        # Enrich original transaction
	        return {
	            **transaction,
	            "tax_category": classification.tax_category,
	            "tax_treatment": classification.tax_treatment,
	            "deductible": classification.deductible,
	            "confidence": classification.confidence,
	            "suggested_rule_ids": classification.suggested_rule_ids,
	            "tax_notes": classification.notes
	        }

	    def classify_batch(self, transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	        """Classify multiple transactions, preserving their order."""
	        return [self.classify_transaction(tx) for tx in transactions]

	    def _classify_by_patterns(
	        self,
	        narration: str,
	        tx_type: str,
	        amount: float
	    ) -> "TaxClassification":
	        """Pattern-based classification using regex.

	        Args:
	            narration: Upper-cased narration/description text.
	            tx_type: "credit" or "debit"; any other value is uncategorized.
	            amount: Absolute amount, forwarded to the category lookups.
	        """
	        # Check income patterns (for credits)
	        if tx_type == "credit":
	            category = self._first_matching_category(self.INCOME_PATTERNS, narration)
	            if category is not None:
	                return self._get_income_classification(category, amount)

	        # Check deduction patterns (for debits)
	        if tx_type == "debit":
	            category = self._first_matching_category(self.DEDUCTION_PATTERNS, narration)
	            if category is not None:
	                return self._get_deduction_classification(category, amount)

	        # Default: uncategorized
	        return self._uncategorized(
	            notes="Could not automatically categorize. Manual review recommended."
	        )

	    def _get_income_classification(self, category: str, amount: float) -> "TaxClassification":
	        """Get classification for income categories.

	        ``amount`` is accepted for symmetry with the caller but is not used
	        here. Unknown categories fall back to a low-confidence "other_income".
	        """
	        classifications = {
	            'employment_income': TaxClassification(
	                tax_category="employment_income",
	                tax_treatment="taxable",
	                deductible=False,
	                confidence=0.95,
	                suggested_rule_ids=["pit.base.gross_income"],
	                notes="Employment income is fully taxable under PITA"
	            ),
	            'business_income': TaxClassification(
	                tax_category="business_income",
	                tax_treatment="taxable",
	                deductible=False,
	                confidence=0.85,
	                suggested_rule_ids=["cit.rate.small_2025", "cit.rate.medium_2025", "cit.rate.large_2025"],
	                notes="Business income subject to CIT or PIT depending on structure"
	            ),
	            'rental_income': TaxClassification(
	                tax_category="rental_income",
	                tax_treatment="taxable",
	                deductible=False,
	                confidence=0.90,
	                suggested_rule_ids=["pit.base.gross_income"],
	                notes="Rental income is taxable. Consider property expenses as deductions."
	            ),
	            'investment_income': TaxClassification(
	                tax_category="investment_income",
	                tax_treatment="taxable",
	                deductible=False,
	                confidence=0.85,
	                suggested_rule_ids=[],
	                notes="Investment income may be subject to withholding tax"
	            )
	        }

	        return classifications.get(category, TaxClassification(
	            tax_category="other_income",
	            tax_treatment="taxable",
	            deductible=False,
	            confidence=0.5,
	            suggested_rule_ids=[]
	        ))

	    def _get_deduction_classification(self, category: str, amount: float) -> "TaxClassification":
	        """Get classification for deduction categories.

	        ``amount`` is accepted for symmetry with the caller but is not used
	        here. Unknown categories fall back to a low-confidence "other_expense".
	        """
	        classifications = {
	            'pension_contribution': TaxClassification(
	                tax_category="pension_contribution",
	                tax_treatment="deductible",
	                deductible=True,
	                confidence=0.95,
	                suggested_rule_ids=["pit.deduction.pension"],
	                notes="Pension contributions to PRA-approved schemes are tax deductible (PITA s.20(1)(g))"
	            ),
	            'nhf_contribution': TaxClassification(
	                tax_category="nhf_contribution",
	                tax_treatment="deductible",
	                deductible=True,
	                confidence=0.95,
	                suggested_rule_ids=["pit.base.taxable_income"],
	                notes="NHF contributions are tax deductible (2.5% of basic salary)"
	            ),
	            'life_insurance': TaxClassification(
	                tax_category="life_insurance",
	                tax_treatment="deductible",
	                deductible=True,
	                confidence=0.85,
	                suggested_rule_ids=["pit.base.taxable_income"],
	                notes="Life insurance premiums are tax deductible if policy is with licensed insurer"
	            ),
	            'health_insurance': TaxClassification(
	                tax_category="health_insurance",
	                tax_treatment="deductible",
	                deductible=True,
	                confidence=0.80,
	                suggested_rule_ids=["pit.base.taxable_income"],
	                notes="Health insurance premiums may be tax deductible"
	            ),
	            'rent_paid': TaxClassification(
	                tax_category="rent_paid",
	                tax_treatment="potentially_deductible",
	                deductible=False,  # Not in 2025, but yes in 2026
	                confidence=0.85,
	                suggested_rule_ids=["pit.relief.rent_2026"],
	                notes="Rent paid: Not deductible in 2025. From 2026, 20% of rent (max ₦500K) under NTA 2025"
	            ),
	            'union_dues': TaxClassification(
	                tax_category="union_dues",
	                tax_treatment="deductible",
	                deductible=True,
	                confidence=0.80,
	                suggested_rule_ids=["pit.base.taxable_income"],
	                notes="Professional association fees and union dues are tax deductible"
	            )
	        }

	        return classifications.get(category, TaxClassification(
	            tax_category="other_expense",
	            tax_treatment="unknown",
	            deductible=False,
	            confidence=0.4,
	            suggested_rule_ids=[]
	        ))

	    def _llm_classify(self, transaction: Dict[str, Any]) -> "TaxClassification":
	        """
	        Use LLM/RAG to classify ambiguous transactions
	        This is a fallback for transactions that don't match patterns

	        The pipeline is expected to answer with "category, yes/no, confidence".
	        Any failure (no pipeline, pipeline error, unparsable answer, or a
	        category that is not one of the known pattern categories) yields the
	        default low-confidence "uncategorized" classification.
	        """
	        if not self.rag:
	            return self._uncategorized()

	        # Accept the same two transaction shapes as classify_transaction.
	        narration = transaction.get("narration") or transaction.get("description") or ""
	        tx_type = transaction.get("type", "")
	        try:
	            amount = self._normalize_amount(transaction)
	        except (TypeError, ValueError):
	            # The amount is only shown in the prompt; a bad value must not abort.
	            amount = 0.0

	        prompt = f"""
	Classify this Nigerian bank transaction for tax purposes:

	Transaction Details:
	- Narration: {narration}
	- Amount: ₦{amount:,.2f}
	- Type: {tx_type}

	Classify into ONE of these categories:
	- employment_income (salary, wages, stipend)
	- business_income (sales, revenue, client payments)
	- rental_income (rent received from tenants)
	- pension_contribution (PFA, RSA contributions)
	- nhf_contribution (National Housing Fund)
	- life_insurance (insurance premiums)
	- rent_paid (rent paid to landlord)
	- union_dues (professional fees, association dues)
	- uncategorized (if unclear)

	Also indicate:
	1. Is it tax deductible? (yes/no)
	2. Confidence level (0.0 to 1.0)

	Respond with just the category name, deductible status, and confidence.
	Example: "employment_income, no, 0.95"
	"""

	        try:
	            # Query RAG pipeline
	            response = self.rag.query(prompt, verbose=False)

	            # Parse "category, yes/no, confidence"; tolerate surrounding quotes.
	            parts = [
	                part.strip().strip("\"'").strip()
	                for part in response.lower().split(',')
	            ]
	            is_income = len(parts) >= 3 and parts[0] in self.INCOME_PATTERNS
	            is_deduction = len(parts) >= 3 and parts[0] in self.DEDUCTION_PATTERNS

	            # Only accept categories this classifier knows about; anything else
	            # (including an explicit "uncategorized") uses the fallback below.
	            if is_income or is_deduction:
	                confidence = float(parts[2])
	                # Income is never a deduction, whatever the model claims.
	                deductible = is_deduction and 'yes' in parts[1]
	                if is_income:
	                    treatment = "taxable"
	                elif deductible:
	                    treatment = "deductible"
	                else:
	                    treatment = "unknown"

	                return TaxClassification(
	                    tax_category=parts[0],
	                    tax_treatment=treatment,
	                    deductible=deductible,
	                    confidence=max(0.0, min(confidence, 0.85)),  # Cap LLM confidence
	                    suggested_rule_ids=[],
	                    notes="Classified using AI analysis"
	                )
	        except Exception as e:
	            # Best-effort boundary around an external service: report and fall back.
	            print(f"LLM classification failed: {e}")

	        # Fallback
	        return self._uncategorized()

	    def get_classification_summary(self, classified_transactions: List[Dict[str, Any]]) -> Dict[str, Any]:
	        """Generate summary statistics of classified transactions.

	        Args:
	            classified_transactions: Dicts as returned by
	                ``classify_transaction``. A missing or None ``tax_category``
	                counts as "uncategorized"; a missing or None ``confidence``
	                counts as 0.

	        Returns:
	            A dict with ``total_transactions``, ``categorized``,
	            ``uncategorized``, ``high_confidence`` (confidence > 0.8),
	            ``categorization_rate``, ``transactions_by_category`` and
	            ``amounts_by_category``. For empty input all of these are zero or
	            empty and the legacy ``total`` key is also included.

	        Raises:
	            ValueError, TypeError: if a transaction amount is not numeric.
	        """
	        total = len(classified_transactions)
	        high_confidence = 0
	        by_category: Dict[str, int] = {}
	        amounts_by_category: Dict[str, float] = {}

	        for tx in classified_transactions:
	            # One category value feeds every statistic so the counts agree.
	            cat = tx.get("tax_category") or "uncategorized"
	            by_category[cat] = by_category.get(cat, 0) + 1
	            amounts_by_category[cat] = (
	                amounts_by_category.get(cat, 0) + self._normalize_amount(tx)
	            )
	            if (tx.get("confidence") or 0) > 0.8:
	                high_confidence += 1

	        uncategorized = by_category.get("uncategorized", 0)
	        categorized = total - uncategorized

	        summary = {
	            "total_transactions": total,
	            "categorized": categorized,
	            "uncategorized": uncategorized,
	            "high_confidence": high_confidence,
	            "categorization_rate": categorized / total if total else 0,
	            "transactions_by_category": by_category,
	            "amounts_by_category": amounts_by_category
	        }
	        if total == 0:
	            # Legacy key: the empty summary used to expose the count as "total".
	            summary["total"] = 0
	        return summary