|
|
"""
|
|
|
Input Sanitization Module for Prompt Injection Defense.
|
|
|
|
|
|
Detects and neutralizes adversarial prompts attempting to manipulate
|
|
|
Káàntà AI's behavior or identity.
|
|
|
"""
|
|
|
|
|
|
import re
|
|
|
import logging
|
|
|
from typing import Tuple, List
|
|
|
from datetime import datetime
|
|
|
|
|
|
logger = logging.getLogger(__name__)

# (regex, weight) pairs scanned by detect_injection_attempt(). The weight is
# the confidence (0.0-1.0) that a match is a prompt-injection attempt; the
# highest weight among all matching patterns becomes the input's score.
#
# Patterns are written in lowercase because the caller lower-cases the text
# before matching. Patterns built from short words carry explicit ``\b``
# anchors so they cannot fire inside ordinary words: without them
# "Ibadan model" matched ``dan\s+mode``, "contract as a" matched
# ``act\s+as\s+a`` and "investigation into wholesale" matched the
# ``(who|company|made)`` alternative.
INJECTION_PATTERNS = [
    # Attempts to cancel or replace the existing instructions.
    (r"ignore\s+(all\s+)?(previous|past|prior|system|your)\s+(instructions?|prompts?|rules?)", 0.9),
    (r"forget\s+(all\s+)?(previous|past|your)\s+(instructions?|training|rules?)", 0.9),
    (r"disregard\s+(all\s+)?(previous|system)\s+(instructions?|prompts?)", 0.9),
    (r"override\s+(your|system|all)\s+(instructions?|programming|rules?)", 0.9),

    # Attempts to assign a new persona.
    # Trailing \b: the article must be a whole word ("an", not "answering").
    (r"you\s+are\s+(now|actually|really)\s+(a|an|the)\b", 0.7),
    (r"pretend\s+(to\s+be|you\s+are)", 0.6),
    # Leading \b keeps "contract as a ..." / "impact as an ..." from matching.
    (r"\bact\s+as\s+(if\s+you\s+are|a|an)\b", 0.5),
    (r"(role.?play|roleplay)\s+as", 0.6),

    # Probing for a "real" identity or origin.
    (r"what\s+company\s+(really|actually|truly)\s+made\s+you", 0.8),
    (r"who\s+(really|actually|truly)\s+(made|created|built)\s+you", 0.7),
    (r"reveal\s+your\s+(true|real|actual)\s+(identity|origin|maker)", 0.9),
    # Whole words only: "cia" must not match the tail of another word and
    # "who" must not match "wholesale" / "whole".
    (r"\b(cia|fbi|investigation)\s+.*\b(who|company|made)\b", 0.9),

    # Named jailbreak modes and filter bypasses.
    # Anchored so "Ibadan model" / "developer model" are not flagged.
    (r"\b(?:dan|developer|god)\s+mode\b", 0.95),
    (r"jailbreak|bypass\s+(your|the)\s+(filter|rules?|restrictions?)", 0.95),
    (r"(escape|break\s+out\s+of)\s+(your|the)\s+(constraints?|limitations?)", 0.85),

    # Attempts to extract the system prompt.
    (r"(show|reveal|print|display|output)\s+(your|the)\s+(system\s+)?prompt", 0.9),
    (r"what\s+(is|are)\s+your\s+(system\s+)?(instructions?|prompt|rules?)", 0.6),
    (r"repeat\s+(back|everything)\s+(before|above|in\s+your\s+prompt)", 0.9),

    # Fake role / delimiter markers embedded in the input.
    (r"\[system\]|\[admin\]|\[override\]|\[ignore\]", 0.95),
    (r"<\s*(system|admin|override)\s*>", 0.95),
    (r"###\s*(instruction|system|admin)", 0.9),
]
|
|
|
|
|
|
|
|
|
# Regexes used by is_identity_challenge() to recognise questions about who
# built the assistant or whether it is really another vendor's model.
IDENTITY_CHALLENGES = [
    # "who made/created/built/designed you"
    r"who\s+(made|created|built|designed)\s+you",
    # "what company ... made you" (anything may sit between the two halves)
    r"what\s+(company|organization|team)\s+.*(made|created|built)\s+you",
    # "are you <other model or vendor>"
    r"are\s+you\s+(chatgpt|gpt|openai|meta|llama|anthropic|claude|google|gemini|bard)",
    # "you're really/actually <other model>" (apostrophe optional)
    r"you'?re\s+(really|actually)\s+(chatgpt|gpt|meta|llama)",
]

# Fixed reply that sanitize_input() returns in place of input judged to be
# an injection attempt.
SAFE_REDIRECT_RESPONSE = "I'm Káàntà AI by Kaanta Solutions. How can I help you with Nigerian tax questions today?"
|
|
|
|
|
|
|
|
|
def detect_injection_attempt(text: str) -> Tuple[float, List[str]]:
    """
    Score input text against the known prompt-injection patterns.

    Every entry of INJECTION_PATTERNS is searched for in the lower-cased
    text. The score is the largest weight among the patterns that matched,
    not a sum, so several weak matches never add up to a strong one.

    Args:
        text: User input to analyze.

    Returns:
        Tuple of (score, labels). The score is 0.0 when the text is empty or
        nothing matched. Each label is the first 50 characters of a matching
        regex, in INJECTION_PATTERNS order.
    """
    if not text:
        return 0.0, []

    lowered = text.lower()
    hits = [
        (weight, pattern[:50])
        for pattern, weight in INJECTION_PATTERNS
        if re.search(pattern, lowered, re.IGNORECASE)
    ]

    # Seed with 0.0 so the score never falls below the no-match baseline.
    top_score = max([0.0, *(weight for weight, _ in hits)])
    return top_score, [label for _, label in hits]
|
|
|
|
|
|
|
|
|
def is_identity_challenge(text: str) -> bool:
    """Return True when the input matches any IDENTITY_CHALLENGES pattern.

    Empty input is never treated as a challenge. Matching runs on the
    lower-cased text.
    """
    if not text:
        return False

    lowered = text.lower()
    return any(
        re.search(challenge, lowered, re.IGNORECASE) is not None
        for challenge in IDENTITY_CHALLENGES
    )
|
|
|
|
|
|
|
|
|
def sanitize_input(text: str, threshold: float = 0.85) -> Tuple[str, bool]:
    """
    Sanitize user input by detecting and handling injection attempts.

    The input is scored with detect_injection_attempt(). If the score
    reaches ``threshold`` the whole input is replaced by
    SAFE_REDIRECT_RESPONSE. Otherwise role/delimiter markers such as
    ``[system]``, ``</admin>`` or ``### instruction:`` are stripped. The
    stripped text is then scored again, because removing a marker can join
    the surrounding text into a phrase that was not detectable before.

    Args:
        text: Raw user input.
        threshold: Score at or above which the input is replaced.

    Returns:
        Tuple of (sanitized text, was_sanitized flag). The flag is True when
        the input was replaced or a marker was removed. Leading and trailing
        whitespace is trimmed from the returned text without setting the
        flag.
    """
    if not text:
        return text, False

    score, patterns = detect_injection_attempt(text)
    if score >= threshold:
        log_suspicious_input(text, score, patterns)
        return SAFE_REDIRECT_RESPONSE, True

    marker_patterns = (
        r"\[(?:system|admin|override|ignore)\]",
        r"<\s*/?(?:system|admin|override)\s*>",
        r"###\s*(?:instruction|system|admin)\s*:?",
    )

    # Strip until nothing changes. A single pass can be defeated by nesting:
    # "<sys</system>tem>" loses its inner tag and becomes "<system>".
    # Every successful pass shortens the string, so the loop terminates.
    sanitized = text
    while True:
        stripped = sanitized
        for marker in marker_patterns:
            stripped = re.sub(marker, "", stripped, flags=re.IGNORECASE)
        if stripped == sanitized:
            break
        sanitized = stripped

    was_modified = sanitized != text
    if was_modified:
        # Removing a marker may have glued together an injection phrase
        # (e.g. "ignore </system>previous instructions"), so score again.
        rescored, rescored_patterns = detect_injection_attempt(sanitized)
        if rescored >= threshold:
            log_suspicious_input(text, rescored, rescored_patterns)
            return SAFE_REDIRECT_RESPONSE, True
        log_suspicious_input(text, score, ["injection_markers_removed"])

    return sanitized.strip(), was_modified
|
|
|
|
|
|
|
|
|
def log_suspicious_input(text: str, score: float, patterns: List[str]) -> None:
    """Log a potential injection attempt at WARNING level for monitoring.

    The details are attached to the log record through ``extra``, so they
    only appear in output if the configured handler or formatter emits those
    fields.

    Args:
        text: The raw input that triggered the warning. Only the first 100
            characters are logged, followed by "..." when truncated.
        score: Detection score associated with the input.
        patterns: Labels describing what was matched or removed.
    """
    # Function-scope import: the module imports only `datetime` from the
    # datetime package, and this keeps the function self-contained.
    from datetime import timezone

    preview = text[:100] + "..." if len(text) > 100 else text

    logger.warning(
        "Potential prompt injection detected",
        extra={
            "score": score,
            "patterns": patterns,
            "input_preview": preview,
            # datetime.utcnow() is deprecated and returns a naive value;
            # an aware UTC timestamp makes the "+00:00" offset explicit.
            "timestamp": datetime.now(timezone.utc).isoformat(),
        },
    )
|
|
|
|
|
|
|
|
|
def get_identity_response() -> str:
    """Return the fixed reply that states who the assistant is."""
    sentences = (
        "I'm Káàntà AI, a Nigerian tax assistant created by Kaanta Solutions.",
        "I'm here to help you understand Nigerian tax laws and regulations.",
        "What would you like to know about taxes?",
    )
    # Sentences are separated by exactly one space.
    return " ".join(sentences)
|
|
|
|