Spaces:

Eniiyanu
/

Kaanta

Running

App Files Files Community

Kaanta / input_sanitizer.py

Eniiyanu

Upload 8 files

8f0ef5f verified 4 days ago

raw

history blame contribute delete

5.6 kB

	"""
	Input Sanitization Module for Prompt Injection Defense.

	Detects and neutralizes adversarial prompts attempting to manipulate
	Káàntà AI's behavior or identity.
	"""

	import re
	import logging
	from typing import Tuple, List
	from datetime import datetime

	logger = logging.getLogger(__name__)

	# Patterns that indicate prompt injection attempts
	INJECTION_PATTERNS = [
	# Identity manipulation
	(r"ignore\s+(all\s+)?(previous\|past\|prior\|system\|your)\s+(instructions?\|prompts?\|rules?)", 0.9),
	(r"forget\s+(all\s+)?(previous\|past\|your)\s+(instructions?\|training\|rules?)", 0.9),
	(r"disregard\s+(all\s+)?(previous\|system)\s+(instructions?\|prompts?)", 0.9),
	(r"override\s+(your\|system\|all)\s+(instructions?\|programming\|rules?)", 0.9),
	(r"you\s+are\s+(now\|actually\|really)\s+(a\|an\|the)", 0.7),
	(r"pretend\s+(to\s+be\|you\s+are)", 0.6),
	(r"act\s+as\s+(if\s+you\s+are\|a\|an)", 0.5),
	(r"(role.?play\|roleplay)\s+as", 0.6),

	# Origin/identity probing
	(r"what\s+company\s+(really\|actually\|truly)\s+made\s+you", 0.8),
	(r"who\s+(really\|actually\|truly)\s+(made\|created\|built)\s+you", 0.7),
	(r"reveal\s+your\s+(true\|real\|actual)\s+(identity\|origin\|maker)", 0.9),
	(r"(cia\|fbi\|investigation)\s+.*(who\|company\|made)", 0.9),

	# Jailbreak attempts
	(r"dan\s+mode\|developer\s+mode\|god\s+mode", 0.95),
	(r"jailbreak\|bypass\s+(your\|the)\s+(filter\|rules?\|restrictions?)", 0.95),
	(r"(escape\|break\s+out\s+of)\s+(your\|the)\s+(constraints?\|limitations?)", 0.85),

	# System prompt extraction
	(r"(show\|reveal\|print\|display\|output)\s+(your\|the)\s+(system\s+)?prompt", 0.9),
	(r"what\s+(is\|are)\s+your\s+(system\s+)?(instructions?\|prompt\|rules?)", 0.6),
	(r"repeat\s+(back\|everything)\s+(before\|above\|in\s+your\s+prompt)", 0.9),

	# Instruction injection markers
	(r"\[system\]\|\[admin\]\|\[override\]\|\[ignore\]", 0.95),
	(r"<\s(system\|admin\|override)\s>", 0.95),
	(r"###\s*(instruction\|system\|admin)", 0.9),
	]

	# Phrases that should trigger identity affirmation response
	IDENTITY_CHALLENGES = [
	r"who\s+(made\|created\|built\|designed)\s+you",
	r"what\s+(company\|organization\|team)\s+.*(made\|created\|built)\s+you",
	r"are\s+you\s+(chatgpt\|gpt\|openai\|meta\|llama\|anthropic\|claude\|google\|gemini\|bard)",
	r"you'?re\s+(really\|actually)\s+(chatgpt\|gpt\|meta\|llama)",
	]

	# Clean response for detected attacks
	SAFE_REDIRECT_RESPONSE = "I'm Káàntà AI by Kaanta Solutions. How can I help you with Nigerian tax questions today?"


	def detect_injection_attempt(text: str) -> Tuple[float, List[str]]:
	"""
	Analyze input text for prompt injection patterns.

	Args:
	text: User input to analyze

	Returns:
	Tuple of (confidence score 0.0-1.0, list of matched pattern descriptions)
	"""
	if not text:
	return 0.0, []

	text_lower = text.lower()
	max_score = 0.0
	matched_patterns = []

	for pattern, weight in INJECTION_PATTERNS:
	if re.search(pattern, text_lower, re.IGNORECASE):
	max_score = max(max_score, weight)
	matched_patterns.append(pattern[:50])

	return max_score, matched_patterns


	def is_identity_challenge(text: str) -> bool:
	"""Check if the input is asking about the AI's identity/origin."""
	if not text:
	return False

	text_lower = text.lower()
	for pattern in IDENTITY_CHALLENGES:
	if re.search(pattern, text_lower, re.IGNORECASE):
	return True
	return False


	def sanitize_input(text: str, threshold: float = 0.85) -> Tuple[str, bool]:
	"""
	Sanitize user input by detecting and handling injection attempts.

	Args:
	text: Raw user input
	threshold: Score threshold above which to replace input

	Returns:
	Tuple of (sanitized text, was_sanitized flag)
	"""
	if not text:
	return text, False

	score, patterns = detect_injection_attempt(text)

	if score >= threshold:
	# Log the attempt
	log_suspicious_input(text, score, patterns)
	return SAFE_REDIRECT_RESPONSE, True

	# Light sanitization - remove obvious injection markers
	sanitized = text
	sanitized = re.sub(r"\[(?:system\|admin\|override\|ignore)\]", "", sanitized, flags=re.IGNORECASE)
	sanitized = re.sub(r"<\s/?(?:system\|admin\|override)\s>", "", sanitized, flags=re.IGNORECASE)
	sanitized = re.sub(r"###\s(?:instruction\|system\|admin)\s:?", "", sanitized, flags=re.IGNORECASE)

	was_modified = sanitized != text
	if was_modified:
	log_suspicious_input(text, score, ["injection_markers_removed"])

	return sanitized.strip(), was_modified


	def log_suspicious_input(text: str, score: float, patterns: List[str]) -> None:
	"""Log potential injection attempts for monitoring."""
	logger.warning(
	"Potential prompt injection detected",
	extra={
	"score": score,
	"patterns": patterns,
	"input_preview": text[:100] + "..." if len(text) > 100 else text,
	"timestamp": datetime.utcnow().isoformat(),
	}
	)


	def get_identity_response() -> str:
	"""Get the standard identity affirmation response."""
	return (
	"I'm Káàntà AI, a Nigerian tax assistant created by Kaanta Solutions. "
	"I'm here to help you understand Nigerian tax laws and regulations. "
	"What would you like to know about taxes?"
	)