Kaanta / input_sanitizer.py
Eniiyanu's picture
Upload 8 files
8f0ef5f verified
"""
Input Sanitization Module for Prompt Injection Defense.
Detects and neutralizes adversarial prompts attempting to manipulate
Káàntà AI's behavior or identity.
"""
import re
import logging
from typing import Tuple, List
from datetime import datetime
logger = logging.getLogger(__name__)
# Patterns that indicate prompt injection attempts
INJECTION_PATTERNS = [
# Identity manipulation
(r"ignore\s+(all\s+)?(previous|past|prior|system|your)\s+(instructions?|prompts?|rules?)", 0.9),
(r"forget\s+(all\s+)?(previous|past|your)\s+(instructions?|training|rules?)", 0.9),
(r"disregard\s+(all\s+)?(previous|system)\s+(instructions?|prompts?)", 0.9),
(r"override\s+(your|system|all)\s+(instructions?|programming|rules?)", 0.9),
(r"you\s+are\s+(now|actually|really)\s+(a|an|the)", 0.7),
(r"pretend\s+(to\s+be|you\s+are)", 0.6),
(r"act\s+as\s+(if\s+you\s+are|a|an)", 0.5),
(r"(role.?play|roleplay)\s+as", 0.6),
# Origin/identity probing
(r"what\s+company\s+(really|actually|truly)\s+made\s+you", 0.8),
(r"who\s+(really|actually|truly)\s+(made|created|built)\s+you", 0.7),
(r"reveal\s+your\s+(true|real|actual)\s+(identity|origin|maker)", 0.9),
(r"(cia|fbi|investigation)\s+.*(who|company|made)", 0.9),
# Jailbreak attempts
(r"dan\s+mode|developer\s+mode|god\s+mode", 0.95),
(r"jailbreak|bypass\s+(your|the)\s+(filter|rules?|restrictions?)", 0.95),
(r"(escape|break\s+out\s+of)\s+(your|the)\s+(constraints?|limitations?)", 0.85),
# System prompt extraction
(r"(show|reveal|print|display|output)\s+(your|the)\s+(system\s+)?prompt", 0.9),
(r"what\s+(is|are)\s+your\s+(system\s+)?(instructions?|prompt|rules?)", 0.6),
(r"repeat\s+(back|everything)\s+(before|above|in\s+your\s+prompt)", 0.9),
# Instruction injection markers
(r"\[system\]|\[admin\]|\[override\]|\[ignore\]", 0.95),
(r"<\s*(system|admin|override)\s*>", 0.95),
(r"###\s*(instruction|system|admin)", 0.9),
]
# Phrases that should trigger identity affirmation response
IDENTITY_CHALLENGES = [
r"who\s+(made|created|built|designed)\s+you",
r"what\s+(company|organization|team)\s+.*(made|created|built)\s+you",
r"are\s+you\s+(chatgpt|gpt|openai|meta|llama|anthropic|claude|google|gemini|bard)",
r"you'?re\s+(really|actually)\s+(chatgpt|gpt|meta|llama)",
]
# Clean response for detected attacks
SAFE_REDIRECT_RESPONSE = "I'm Káàntà AI by Kaanta Solutions. How can I help you with Nigerian tax questions today?"
def detect_injection_attempt(text: str) -> Tuple[float, List[str]]:
"""
Analyze input text for prompt injection patterns.
Args:
text: User input to analyze
Returns:
Tuple of (confidence score 0.0-1.0, list of matched pattern descriptions)
"""
if not text:
return 0.0, []
text_lower = text.lower()
max_score = 0.0
matched_patterns = []
for pattern, weight in INJECTION_PATTERNS:
if re.search(pattern, text_lower, re.IGNORECASE):
max_score = max(max_score, weight)
matched_patterns.append(pattern[:50])
return max_score, matched_patterns
def is_identity_challenge(text: str) -> bool:
"""Check if the input is asking about the AI's identity/origin."""
if not text:
return False
text_lower = text.lower()
for pattern in IDENTITY_CHALLENGES:
if re.search(pattern, text_lower, re.IGNORECASE):
return True
return False
def sanitize_input(text: str, threshold: float = 0.85) -> Tuple[str, bool]:
"""
Sanitize user input by detecting and handling injection attempts.
Args:
text: Raw user input
threshold: Score threshold above which to replace input
Returns:
Tuple of (sanitized text, was_sanitized flag)
"""
if not text:
return text, False
score, patterns = detect_injection_attempt(text)
if score >= threshold:
# Log the attempt
log_suspicious_input(text, score, patterns)
return SAFE_REDIRECT_RESPONSE, True
# Light sanitization - remove obvious injection markers
sanitized = text
sanitized = re.sub(r"\[(?:system|admin|override|ignore)\]", "", sanitized, flags=re.IGNORECASE)
sanitized = re.sub(r"<\s*/?(?:system|admin|override)\s*>", "", sanitized, flags=re.IGNORECASE)
sanitized = re.sub(r"###\s*(?:instruction|system|admin)\s*:?", "", sanitized, flags=re.IGNORECASE)
was_modified = sanitized != text
if was_modified:
log_suspicious_input(text, score, ["injection_markers_removed"])
return sanitized.strip(), was_modified
def log_suspicious_input(text: str, score: float, patterns: List[str]) -> None:
"""Log potential injection attempts for monitoring."""
logger.warning(
"Potential prompt injection detected",
extra={
"score": score,
"patterns": patterns,
"input_preview": text[:100] + "..." if len(text) > 100 else text,
"timestamp": datetime.utcnow().isoformat(),
}
)
def get_identity_response() -> str:
"""Get the standard identity affirmation response."""
return (
"I'm Káàntà AI, a Nigerian tax assistant created by Kaanta Solutions. "
"I'm here to help you understand Nigerian tax laws and regulations. "
"What would you like to know about taxes?"
)