"""
SauerkrautLM-ColPali Demo Space
Visual Document Retrieval with Similarity Heat Maps + VLM Answer Generation
Multi-document indexing for realistic retrieval scenarios
"""
import os
import gradio as gr
import torch
import numpy as np
from PIL import Image
from einops import rearrange
from huggingface_hub import login
import spaces
import math
from pathlib import Path
from typing import List, Tuple, Optional, Dict
# Import model classes at startup
from sauerkrautlm_colpali.models.lfm2.collfm2.modeling_collfm2 import ColLFM2
from sauerkrautlm_colpali.models.lfm2.collfm2.processing_collfm2 import ColLFM2Processor
from sauerkrautlm_colpali.models.qwen3.colqwen3.modeling_colqwen3 import ColQwen3
from sauerkrautlm_colpali.models.qwen3.colqwen3.processing_colqwen3 import ColQwen3Processor
print("All model imports successful!")
EPSILON = 1e-10
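# Small constant to guard the min-max normalizations below against division by zero.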
def install_fa2():
print("Installing Flash Attention 2...")
os.system("pip install flash-attn --no-build-isolation")
# HF Token for private models
hf_token = os.getenv("HF_KEY")
if hf_token:
login(token=hf_token)
# ColPali Model options
COLPALI_MODELS = {
"SauerkrautLM-ColLFM2-450M (Fastest, 0.9GB)": "VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1",
"SauerkrautLM-ColQwen3-1.7B-Turbo (Fast, 3.4GB)": "VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1",
"SauerkrautLM-ColQwen3-2B (Balanced, 4.4GB)": "VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1",
"SauerkrautLM-ColQwen3-4B (Quality, 8GB)": "VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1",
"SauerkrautLM-ColQwen3-8B (Best, 16GB)": "VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1",
}
# Global model cache
loaded_colpali_model = None
loaded_colpali_processor = None
loaded_colpali_model_name = None
loaded_vlm_model = None
loaded_vlm_processor = None
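# Models are cached in module globals so repeated @spaces.GPU calls reuse the
# already-loaded weights instead of re-downloading and re-initializing them.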
# =============================================================================
# EXAMPLE CONFIGURATION - Organized by language and use case
# =============================================================================
EXAMPLE_CONFIG = [
# ==================== GERMAN (DE) ====================
# Annual Reports
{
"file": "deutsch/2024-infineon-geschaeftsbericht-v01-00-de_p4.png",
"query": "Wie hoch ist der Umsatz von Infineon?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Infineon GeschΓ€ftsbericht 2024 - Kennzahlen"
},
{
"file": "deutsch/BASF_Bericht_2024_p90.png",
"query": "Wie ist das Risikomanagement bei BASF organisiert?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "BASF Bericht 2024 - Risikomanagement"
},
{
"file": "deutsch/entire-dtag-gb24_p351.png",
"query": "Wie setzt sich die VorstandsvergΓΌtung bei der Deutschen Telekom zusammen?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Deutsche Telekom GB 2024 - VergΓΌtung"
},
{
"file": "deutsch/entire-dtag-gb24_p77.png",
"query": "Wie hoch sind die Nettofinanzverbindlichkeiten der Deutschen Telekom?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Deutsche Telekom GB 2024 - Finanzen"
},
# Hydrogen / Energy
{
"file": "deutsch/Bildschirmfoto 2025-12-14 um 01.31.45.png",
"query": "Wo verlΓ€uft die geplante Wasserstofftrasse in Leipzig?",
"category": "⚑ Hydrogen/Energy",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Wasserstoff-Infrastruktur Leipzig"
},
{
"file": "deutsch/Bildschirmfoto 2025-12-14 um 01.36.01.png",
"query": "Wie wird die GewΓ€sserquerung der Wasserstoffleitung realisiert?",
"category": "⚑ Hydrogen/Energy",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Wasserstoff-Technische Zeichnung"
},
# Tax/Forms
{
"file": "deutsch/ESt_1_A_2022_p2.png",
"query": "Wo trage ich meine Bankverbindung in der SteuererklΓ€rung ein?",
"category": "πŸ“ Tax Form",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "EinkommensteuererklΓ€rung ESt 1A"
},
# Economic Reports
{
"file": "deutsch/Monatsbericht---Oktober-2025_p152.png",
"query": "Wie hoch sind die aktuellen ZinssΓ€tze fΓΌr Wohnungsbaukredite?",
"category": "πŸ’° Financial Report",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Bundesbank Monatsbericht - ZinssΓ€tze"
},
{
"file": "deutsch/sd-2025-digital-07-wollmershaeuser-etal-ifo-konjunkturprognose-sommer-2025_p23.png",
"query": "Was sind die Annahmen fΓΌr den Γ–lpreis in der ifo-Prognose?",
"category": "πŸ“ˆ Economic Forecast",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "ifo Konjunkturprognose 2025"
},
# Environmental
{
"file": "deutsch/rep0913_p197.png",
"query": "Wie hat sich der Fleischkonsum in Γ–sterreich entwickelt?",
"category": "🌱 Environmental",
"lang": "πŸ‡©πŸ‡ͺ DE",
"description": "Klimaschutzbericht - Umweltbundesamt"
},
# ==================== ENGLISH (EN) ====================
# ESG/Sustainability
{
"file": "englisch/2025051910270996484_p19.png",
"query": "What is the waste recycling rate at CRRC?",
"category": "🌱 ESG Report",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "CRRC ESG Report 2024 - Waste Management"
},
# Scientific
{
"file": "englisch/6e81bb8284357ea1773e99832d21c65b_new_myosinlecture_ag_p5.png",
"query": "What are the different classes of myosin?",
"category": "πŸ”¬ Scientific Paper",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "Myosin Phylogenetic Tree"
},
# Historical/Vintage
{
"file": "englisch/ADVE_0004.png",
"query": "Which cigarette brand advertised 'Call for Philip Morris'?",
"category": "πŸ“œ Historical Document",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "Vintage Philip Morris Advertisement"
},
# Business Forms
{
"file": "englisch/Form_0033.png",
"query": "How long should vendor audit records be retained?",
"category": "πŸ“ Business Form",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "Records Retention Schedule"
},
{
"file": "englisch/Letter_0061.png",
"query": "Who requested copies of the 'Helping Youth Decide' booklet?",
"category": "βœ‰οΈ Business Letter",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "Girl Scouts Correspondence"
},
# Financial Reports
{
"file": "englisch/NASDAQ_DDD_2024_p76.png",
"query": "What is the total stockholders' equity of 3D Systems?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "3D Systems Financial Statement"
},
{
"file": "englisch/TMUS-2024-Annual-Report_p143.png",
"query": "Who is the CEO of T-Mobile US?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "T-Mobile US Annual Report 2024"
},
{
"file": "englisch/pwc-transparency-report-2023-2024_p33.png",
"query": "What are the rotation periods for audit partners at PwC?",
"category": "πŸ“‹ Transparency Report",
"lang": "πŸ‡¬πŸ‡§ EN",
"description": "PwC Transparency Report"
},
# ==================== FRENCH (FR) ====================
{
"file": "franzΓΆsisch/194000315_0_p178.png",
"query": "Quel est le coΓ»t du travail au SMIC en France?",
"category": "πŸ’° Labor Statistics",
"lang": "πŸ‡«πŸ‡· FR",
"description": "Statistiques du travail - SMIC"
},
{
"file": "franzΓΆsisch/194000315_0_p21.png",
"query": "Quelle est la prΓ©vision de croissance du PIB en zone euro?",
"category": "πŸ“ˆ Economic Forecast",
"lang": "πŸ‡«πŸ‡· FR",
"description": "PrΓ©visions Γ©conomiques Zone Euro"
},
{
"file": "franzΓΆsisch/Cours-de-physique-1v2_p44.png",
"query": "Comment fonctionne la vision des couleurs?",
"category": "πŸŽ“ Educational",
"lang": "πŸ‡«πŸ‡· FR",
"description": "Cours de Physique - Vision"
},
{
"file": "franzΓΆsisch/CSSF_RA_2024_FR_p14.png",
"query": "Quelle est la rΓ©partition des employΓ©s de la CSSF par nationalitΓ©?",
"category": "πŸ“Š Annual Report",
"lang": "πŸ‡«πŸ‡· FR",
"description": "CSSF Luxembourg - Rapport Annuel"
},
{
"file": "franzΓΆsisch/ICN_Definition-Nursing_Report_FR_Web_p47.png",
"query": "Combien d'associations nationales d'infirmières participent au CII?",
"category": "πŸ₯ Healthcare Report",
"lang": "πŸ‡«πŸ‡· FR",
"description": "ICN Rapport Infirmières"
},
{
"file": "franzΓΆsisch/rapport-cns-2024-internet_p14.png",
"query": "Quelles sont les principales missions de la CNS?",
"category": "πŸ₯ Healthcare Report",
"lang": "πŸ‡«πŸ‡· FR",
"description": "CNS Rapport Annuel"
},
{
"file": "franzΓΆsisch/rapport-esg-2024.pdf.coredownload.inline_p29.png",
"query": "Quels sont les objectifs ESG pour 2024?",
"category": "🌱 ESG Report",
"lang": "πŸ‡«πŸ‡· FR",
"description": "Rapport ESG 2024"
},
# ==================== SPANISH (ES) ====================
{
"file": "spanisch/Coeur-ESG-Report-23-May-2024-Spanish-version-compressed_p31.png",
"query": "ΒΏCuΓ‘les son las emisiones de gases de efecto invernadero de Coeur Mining?",
"category": "🌱 ESG Report",
"lang": "πŸ‡ͺπŸ‡Έ ES",
"description": "Coeur Mining ESG - Emisiones"
},
{
"file": "spanisch/Coeur-ESG-Report-23-May-2024-Spanish-version-compressed_p39.png",
"query": "ΒΏQuΓ© medidas de seguridad implementa Coeur Mining?",
"category": "🌱 ESG Report",
"lang": "πŸ‡ͺπŸ‡Έ ES",
"description": "Coeur Mining ESG - Seguridad"
},
{
"file": "spanisch/Informe-Economico-Regional-2022-2023_p112.png",
"query": "ΒΏCuΓ‘l es la situaciΓ³n econΓ³mica regional en 2023?",
"category": "πŸ“ˆ Economic Report",
"lang": "πŸ‡ͺπŸ‡Έ ES",
"description": "Informe EconΓ³mico Regional"
},
{
"file": "spanisch/Informe-Sostenibilidad-ESG-2024_p16.png",
"query": "ΒΏCuΓ‘les son los objetivos de sostenibilidad para 2024?",
"category": "🌱 ESG Report",
"lang": "πŸ‡ͺπŸ‡Έ ES",
"description": "Informe Sostenibilidad ESG"
},
{
"file": "spanisch/MAPs_PLAN_DESARROLLO_p14.png",
"query": "ΒΏCuΓ‘ntas propuestas de renovables se recibieron en EspaΓ±a?",
"category": "⚑ Energy Infrastructure",
"lang": "πŸ‡ͺπŸ‡Έ ES",
"description": "Plan Desarrollo Red ElΓ©ctrica"
},
{
"file": "spanisch/Presupuestos_p32.png",
"query": "ΒΏCuΓ‘l es el presupuesto total asignado?",
"category": "πŸ’° Budget Document",
"lang": "πŸ‡ͺπŸ‡Έ ES",
"description": "Presupuestos Generales"
},
]
def get_all_example_images() -> List[Tuple[str, Image.Image]]:
"""Load all example images for multi-document indexing."""
examples_dir = Path(__file__).parent / "demopics"
images = []
for example in EXAMPLE_CONFIG:
filepath = examples_dir / example["file"]
if filepath.exists():
try:
img = Image.open(filepath).convert("RGB")
images.append((str(filepath), img))
except Exception as e:
print(f"Error loading {filepath}: {e}")
return images
def get_available_examples():
"""Load examples for the Gradio Examples component (shuffled)."""
import random
examples_dir = Path(__file__).parent / "demopics"
available = []
for example in EXAMPLE_CONFIG:
filepath = examples_dir / example["file"]
if filepath.exists():
available.append([str(filepath), example["query"]])
    # Shuffle with a fixed seed so languages are mixed in a consistent order,
    # without reseeding the global RNG as a side effect.
    random.Random(42).shuffle(available)
return available if available else None
def get_example_gallery_data():
"""Get data for the example gallery with categories."""
examples_dir = Path(__file__).parent / "demopics"
gallery_data = []
for example in EXAMPLE_CONFIG:
filepath = examples_dir / example["file"]
if filepath.exists():
gallery_data.append({
"path": str(filepath),
"query": example["query"],
"label": f"{example['lang']} {example['category']}: {example['description']}",
"category": example["category"],
"lang": example["lang"],
})
return gallery_data
@spaces.GPU
def load_colpali_model(model_choice: str):
"""Load the selected ColPali model with proper device placement."""
global loaded_colpali_model, loaded_colpali_processor, loaded_colpali_model_name
model_name = COLPALI_MODELS[model_choice]
if loaded_colpali_model_name == model_name and loaded_colpali_model is not None:
gr.Info(f"βœ… {model_choice} ready!")
return loaded_colpali_model, loaded_colpali_processor
gr.Info(f"⏳ Loading {model_choice}... Please wait.")
if loaded_colpali_model is not None:
gr.Info("πŸ”„ Unloading previous model...")
try:
del loaded_colpali_model
del loaded_colpali_processor
except Exception:
pass
torch.cuda.empty_cache()
try:
import flash_attn
attn_impl = "flash_attention_2"
gr.Info("⚑ Using Flash Attention 2")
except ImportError:
attn_impl = "sdpa"
gr.Info("πŸ”§ Using SDPA attention")
print(f"Loading {model_name} with attention: {attn_impl}")
if "ColLFM2" in model_name:
gr.Info("πŸ“₯ Downloading model weights...")
loaded_colpali_model = ColLFM2.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
attn_implementation=attn_impl,
token=hf_token,
).eval().to("cuda")
gr.Info("πŸ“₯ Downloading processor...")
loaded_colpali_processor = ColLFM2Processor.from_pretrained(model_name, token=hf_token)
elif "ColQwen3" in model_name:
gr.Info("πŸ“₯ Downloading model weights...")
loaded_colpali_model = ColQwen3.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
attn_implementation=attn_impl,
device_map="cuda",
token=hf_token,
).eval()
gr.Info("πŸ“₯ Downloading processor...")
loaded_colpali_processor = ColQwen3Processor.from_pretrained(model_name, token=hf_token)
else:
raise ValueError(f"Unknown model type: {model_name}")
loaded_colpali_model_name = model_name
gr.Info(f"βœ… {model_choice} loaded and ready!")
return loaded_colpali_model, loaded_colpali_processor
@spaces.GPU
def load_vlm_model():
"""Load Qwen3-VL-4B for answer generation."""
global loaded_vlm_model, loaded_vlm_processor
if loaded_vlm_model is not None:
gr.Info("βœ… Qwen3-VL-4B ready!")
return loaded_vlm_model, loaded_vlm_processor
gr.Info("⏳ Loading Qwen3-VL-4B-Instruct... Please wait.")
from transformers import AutoModelForImageTextToText, AutoProcessor
vlm_model_name = "Qwen/Qwen3-VL-4B-Instruct"
print(f"Loading VLM: {vlm_model_name}")
gr.Info("πŸ“₯ Downloading VLM model weights (8GB)...")
loaded_vlm_model = AutoModelForImageTextToText.from_pretrained(
vlm_model_name,
torch_dtype=torch.bfloat16,
device_map="cuda",
token=hf_token,
).eval()
gr.Info("πŸ“₯ Downloading VLM processor...")
loaded_vlm_processor = AutoProcessor.from_pretrained(vlm_model_name, token=hf_token)
gr.Info("βœ… Qwen3-VL-4B loaded and ready!")
return loaded_vlm_model, loaded_vlm_processor
def on_vlm_toggle(enabled):
"""Show hint when VLM is enabled."""
if enabled:
gr.Info("ℹ️ VLM (Qwen3-VL-4B) will be loaded on first analysis. This adds ~30-60 seconds.")
return enabled
def get_similarity_maps_from_embeddings(
image_embeddings: torch.Tensor,
query_embeddings: torch.Tensor,
n_patches: Tuple[int, int],
image_mask: torch.Tensor,
query_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""EXACT ColPali implementation of similarity map computation."""
idx = 0
n_patches_x, n_patches_y = n_patches[0], n_patches[1]
n_image_tokens = int(image_mask[idx].sum().item())
expected_tokens = n_patches_x * n_patches_y
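    # If the model merged or padded patch tokens, the advertised grid no longer
    # matches the actual token count; fall back to the most square factorization.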
if n_image_tokens != expected_tokens:
n = n_image_tokens
sqrt_n = int(math.sqrt(n))
for i in range(sqrt_n, 0, -1):
if n % i == 0:
n_patches_x, n_patches_y = n // i, i
break
image_embedding_grid = rearrange(
image_embeddings[idx][image_mask[idx]],
"(h w) c -> w h c",
w=n_patches_x,
h=n_patches_y,
)
query_emb = query_embeddings[idx]
if query_mask is not None:
query_emb = query_emb[query_mask[idx]]
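    # einsum "nk,ijk->nij": n query tokens x (w, h) patch grid -> n per-token maps.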
similarity_map = torch.einsum(
"nk,ijk->nij",
query_emb,
image_embedding_grid,
)
return similarity_map
def create_heatmap_overlay(
image: Image.Image,
similarity_map: torch.Tensor,
alpha: float = 0.5,
skip_normalize: bool = False,
) -> Image.Image:
"""Create heatmap overlay following EXACT ColPali visualization."""
import seaborn as sns
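    # Min-max normalize the map to [0, 1] (unless the caller pre-normalized),
    # upscale it to the image size, colorize it with the mako palette, and
    # alpha-blend it over the original document image.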
sim_float = similarity_map.float()
if skip_normalize:
sim_array = sim_float.cpu().numpy()
else:
min_val = sim_float.min()
max_val = sim_float.max()
        sim_normalized = (sim_float - min_val) / (max_val - min_val + EPSILON)
sim_array = sim_normalized.cpu().numpy()
sim_array = rearrange(sim_array, "h w -> w h")
sim_image = Image.fromarray((sim_array * 255).astype(np.uint8))
sim_image = sim_image.resize(image.size, Image.Resampling.BICUBIC)
sim_resized = np.array(sim_image) / 255.0
cmap = sns.color_palette("mako", as_cmap=True)
heatmap_rgba = cmap(sim_resized)
heatmap = (heatmap_rgba[:, :, :3] * 255).astype(np.uint8)
img_array = np.array(image.convert("RGB")).astype(np.float32)
heatmap_float = heatmap.astype(np.float32)
blended = img_array * (1 - alpha) + heatmap_float * alpha
blended = np.clip(blended, 0, 255).astype(np.uint8)
return Image.fromarray(blended)
def get_collfm2_heatmap(model, processor, image, image_embeddings, query_embeddings, batch_images, batch_queries):
"""Generate heatmap for ColLFM2 models (simplified version)."""
try:
if "input_ids" not in batch_images:
return None
input_ids = batch_images["input_ids"][0]
tokenizer = processor.tokenizer if hasattr(processor, 'tokenizer') else processor.processor.tokenizer
image_token_id = tokenizer.convert_tokens_to_ids('<image>')
image_mask = input_ids == image_token_id
n_image_tokens = image_mask.sum().item()
# Find best grid that matches the token count and image aspect ratio
img_width, img_height = image.size
img_ratio = img_width / img_height
best_diff = float('inf')
n_patches_x, n_patches_y = int(math.sqrt(n_image_tokens)), int(math.sqrt(n_image_tokens))
for i in range(1, int(math.sqrt(n_image_tokens)) + 1):
if n_image_tokens % i == 0:
j = n_image_tokens // i
ratio1 = j / i
ratio2 = i / j
diff1 = abs(img_ratio - ratio1)
diff2 = abs(img_ratio - ratio2)
if diff1 < best_diff:
best_diff = diff1
n_patches_x, n_patches_y = j, i
if diff2 < best_diff:
best_diff = diff2
n_patches_x, n_patches_y = i, j
query_emb = query_embeddings[0]
# Filter padding
pad_token_id = getattr(tokenizer, "pad_token_id", 0) or 0
query_mask = batch_queries["input_ids"][0] != pad_token_id
query_emb = query_emb[query_mask]
# Get image embeddings
image_emb = image_embeddings[0][image_mask][:n_patches_x * n_patches_y]
image_grid = rearrange(image_emb, "(h w) c -> w h c", w=n_patches_x, h=n_patches_y)
# Compute similarity
similarity_map = torch.einsum("nk,ijk->nij", query_emb, image_grid)
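        # Aggregate the per-token maps by taking, for each patch, the
        # best-matching query token (max over the token dimension).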
aggregated = similarity_map.max(dim=0).values
# Aggressive normalization for ColLFM2
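        # Keep only the top 10% of patches (0.90 quantile) and rescale them to
        # [0.5, 1.0]; everything below the threshold stays at 0, so the overlay
        # highlights regions despite the small score spread of these embeddings.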
agg_float = aggregated.float()
threshold = torch.quantile(agg_float.flatten(), 0.90)
hot_mask = agg_float > threshold
min_hot = agg_float[hot_mask].min() if hot_mask.sum() > 0 else threshold
max_hot = agg_float.max()
normalized = torch.zeros_like(agg_float)
if hot_mask.sum() > 0:
            normalized[hot_mask] = 0.5 + 0.5 * (agg_float[hot_mask] - min_hot) / (max_hot - min_hot + EPSILON)
return create_heatmap_overlay(image, normalized, skip_normalize=True)
    except Exception:
import traceback
print(f"ColLFM2 Heatmap error: {traceback.format_exc()}")
return None
@spaces.GPU
def generate_vlm_answer(image: Image.Image, query: str) -> str:
"""Generate an answer using Qwen3-VL-4B-Instruct."""
try:
vlm_model, vlm_processor = load_vlm_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": f"Based on this document image, please answer the following question:\n\n{query}\n\nProvide a clear and concise answer based only on the information visible in the document."},
],
}
]
text = vlm_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
from qwen_vl_utils import process_vision_info
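        # process_vision_info extracts the image (and video) inputs from the
        # chat messages in the format the Qwen VL processor expects.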
image_inputs, video_inputs = process_vision_info(messages)
inputs = vlm_processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
).to("cuda")
with torch.no_grad():
generated_ids = vlm_model.generate(
**inputs,
max_new_tokens=512,
do_sample=True,
temperature=0.7,
top_p=0.9,
)
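        # Strip the prompt tokens from each sequence so only the newly
        # generated answer tokens are decoded.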
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
answer = vlm_processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return answer
    except Exception as e:
        import traceback
        print(f"VLM error: {traceback.format_exc()}")
        return f"Error generating answer: {str(e)}"
@spaces.GPU
def process_query_with_corpus(model_choice: str, image: Image.Image, query: str, enable_vlm: bool, enable_corpus: bool):
"""
Process a query against an image and optionally against all corpus documents.
Returns similarity score, ranking info, heatmap, and optional VLM answer.
"""
if image is None:
return None, "⚠️ Please upload an image.", None, "", None
if not query.strip():
return None, "⚠️ Please enter a search query.", None, "", None
try:
model, processor = load_colpali_model(model_choice)
device = next(model.parameters()).device
if image.mode != "RGB":
image = image.convert("RGB")
# Process query
batch_queries = processor.process_queries([query]).to(device)
with torch.no_grad():
query_embeddings = model(**batch_queries)
# Process main image
batch_images = processor.process_images([image]).to(device)
with torch.no_grad():
image_embeddings = model(**batch_images)
gr.Info("πŸ“Š Computing similarity scores...")
scores = processor.score(query_embeddings, image_embeddings)
main_score = scores[0][0].item()
# Initialize for VLM
ranking_info = None
top_document_image = image
top_document_label = "Your Document"
if enable_corpus:
gr.Info("πŸ“š Indexing corpus documents...")
# Index all corpus documents
corpus_images = get_all_example_images()
if corpus_images:
# Check if user's image matches any corpus image by comparing pixels
user_img_array = np.array(image.resize((64, 64)))
user_is_example = False
user_example_idx = -1
for idx, (path, img) in enumerate(corpus_images):
corpus_img_array = np.array(img.resize((64, 64)))
if np.allclose(user_img_array, corpus_img_array, atol=10):
user_is_example = True
user_example_idx = idx
break
all_scores = []
# Process ALL corpus images
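                # Small batches keep peak GPU memory bounded while embedding
                # the corpus; each batch is scored and then discarded.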
batch_size = 4
for i in range(0, len(corpus_images), batch_size):
batch_imgs = [img for _, img in corpus_images[i:i+batch_size]]
batch_corpus = processor.process_images(batch_imgs).to(device)
with torch.no_grad():
corpus_embeddings = model(**batch_corpus)
corpus_scores = processor.score(query_embeddings, corpus_embeddings)
for j, (path, img) in enumerate(corpus_images[i:i+batch_size]):
score = corpus_scores[0][j].item()
example_name = Path(path).name
is_user_doc = (i + j == user_example_idx)
label = example_name
for ex in EXAMPLE_CONFIG:
if ex["file"].endswith(example_name):
label = f"{ex['lang']} {ex['description']}"
break
if is_user_doc:
label = f"πŸ“„ {label} (Selected)"
all_scores.append((score, label, path, img))
# If user uploaded custom document (not from corpus), add it
if not user_is_example:
all_scores.append((main_score, "πŸ“„ Your Document", None, image))
# Sort by score descending
all_scores.sort(key=lambda x: x[0], reverse=True)
# Get top document for VLM
top_score, top_label, top_path, top_img = all_scores[0]
top_document_image = top_img
top_document_label = top_label
# Find rank of user's document
user_rank = next((i for i, (_, label, _, _) in enumerate(all_scores) if "πŸ“„" in label), 0) + 1
# Build ranking display
total_docs = len(all_scores)
ranking_lines = [f"### πŸ“Š Ranking (out of {total_docs} documents)"]
ranking_lines.append(f"**Your document ranks #{user_rank}**\n")
ranking_lines.append("| Rank | Score | Document |")
ranking_lines.append("|------|-------|----------|")
for rank, (score, label, _, _) in enumerate(all_scores[:10], 1):
marker = "πŸ‘‰ " if "πŸ“„" in label else ""
ranking_lines.append(f"| {rank} | {score:.4f} | {marker}{label} |")
if len(all_scores) > 10:
ranking_lines.append(f"| ... | ... | *{len(all_scores) - 10} more documents* |")
ranking_info = "\n".join(ranking_lines)
# Generate heatmap
gr.Info("πŸ”₯ Generating similarity heatmap...")
heatmap_image = None
heatmap_available = False
if "ColQwen3" in loaded_colpali_model_name:
try:
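                # spatial_merge_size controls how many raw vision patches are
                # merged into one token; 2 is assumed as the default for the
                # Qwen-VL-style patch merger if the model does not expose it.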
spatial_merge_size = getattr(model, "spatial_merge_size", 2)
n_patches = processor.get_n_patches(
image_size=image.size,
spatial_merge_size=spatial_merge_size,
)
image_mask = batch_images["input_ids"] == processor.image_token_id
pad_token_id = getattr(processor.tokenizer, "pad_token_id", 0)
query_mask = batch_queries["input_ids"] != pad_token_id
similarity_map = get_similarity_maps_from_embeddings(
image_embeddings,
query_embeddings,
n_patches,
image_mask,
query_mask,
)
aggregated = similarity_map.max(dim=0).values
heatmap_image = create_heatmap_overlay(image, aggregated)
heatmap_available = True
except Exception as e:
import traceback
print(f"Heatmap error: {traceback.format_exc()}")
elif "ColLFM2" in loaded_colpali_model_name:
heatmap_image = get_collfm2_heatmap(
model, processor, image, image_embeddings, query_embeddings, batch_images, batch_queries
)
if heatmap_image is not None:
heatmap_available = True
# Build result text
if heatmap_available:
if "ColLFM2" in loaded_colpali_model_name:
result_text = f"""## πŸ“Š Similarity Score: **{main_score:.4f}**
πŸ”΅ **Dark blue** = low relevance | 🟒 **Cyan/Green** = high relevance
⚠️ **ColLFM2 Heatmap Note:** This model uses a SigLIP2 vision encoder with pixel unshuffle, producing "holistic" embeddings. The heatmap shows **region-level relevance** rather than precise word-level localization. This is expected behavior - ColLFM2 excels at determining *if* a document is relevant. For precise heatmaps, try ColQwen3 models."""
else:
result_text = f"""## πŸ“Š Similarity Score: **{main_score:.4f}**
The heatmap shows which areas of the document are most relevant to your query.
πŸ”΅ **Dark blue** = low relevance | 🟒 **Cyan/Green** = high relevance"""
else:
result_text = f"""## πŸ“Š Similarity Score: **{main_score:.4f}**
*Heatmap visualization is not available for this model configuration.*"""
# Generate VLM answer using top-ranked document
vlm_answer = ""
if enable_vlm:
vlm_answer = generate_vlm_answer(top_document_image, query)
            if enable_corpus and "πŸ“„" not in top_document_label:
vlm_answer = f"*[Answer based on top-ranked document: {top_document_label}]*\n\n{vlm_answer}"
gr.Info("βœ… Analysis complete!")
return main_score, result_text, heatmap_image, vlm_answer, ranking_info
except Exception as e:
import traceback
error_msg = f"❌ Error: {str(e)}\n\n```\n{traceback.format_exc()}\n```"
return None, error_msg, None, "", None
def create_demo():
available_examples = get_available_examples()
with gr.Blocks(title="SauerkrautLM-ColPali Demo") as demo:
# Header with logo
gr.HTML("""
<div style="text-align: center; padding: 20px 0;">
<div style="margin: 0 auto 20px auto; max-width: 800px;">
<img src="https://vago-solutions.ai/wp-content/uploads/2025/12/Sauerkrautlm-colpali-scaled.png"
alt="SauerkrautLM-ColPali"
style="width: 75%; border-radius: 12px;"/>
</div>
<p style="color: #888; font-size: 1.2rem; margin: 0 0 16px 0;">
Visual Document Retrieval with Multi-Vector Embeddings + VLM Answer Generation
</p>
<div style="margin-top: 16px;">
<a href="https://huggingface.co/VAGOsolutions" target="_blank" style="color: #667eea; text-decoration: none; margin: 0 12px; font-weight: 500;">πŸ€— Models</a>
<a href="https://github.com/VAGOsolutions/sauerkrautlm-colpali" target="_blank" style="color: #667eea; text-decoration: none; margin: 0 12px; font-weight: 500;">πŸ“– GitHub</a>
<a href="https://vago-solutions.ai" target="_blank" style="color: #667eea; text-decoration: none; margin: 0 12px; font-weight: 500;">🌐 VAGO Solutions</a>
</div>
</div>
""")
with gr.Row():
# Left Column - Inputs
with gr.Column(scale=1):
gr.HTML('<h3 style="color: #b0b0b0; margin-bottom: 16px;">βš™οΈ Configuration</h3>')
model_dropdown = gr.Dropdown(
choices=list(COLPALI_MODELS.keys()),
value="SauerkrautLM-ColQwen3-2B (Balanced, 4.4GB)",
label="πŸ” Retrieval Model",
info="ColPali-based model for document retrieval and heatmap",
)
with gr.Row():
enable_vlm = gr.Checkbox(
label="πŸ€– Enable VLM",
value=False,
info="Use Qwen3-VL-4B for answers (adds ~30-60s on first use)",
)
enable_corpus = gr.Checkbox(
label="πŸ“š Compare with Corpus",
value=True,
info="Rank against 31 example documents",
)
gr.HTML('<h3 style="color: #b0b0b0; margin: 24px 0 16px 0;">πŸ“„ Document</h3>')
image_input = gr.Image(
label="Upload Document Image",
type="pil",
height=350,
)
query_input = gr.Textbox(
label="πŸ” Search Query",
placeholder="e.g., What is the total revenue? / Wie hoch ist der Umsatz?",
lines=2,
)
submit_btn = gr.Button(
"πŸš€ Analyze Document",
variant="primary",
size="lg",
)
# Right Column - Results
with gr.Column(scale=1):
gr.HTML('<h3 style="color: #b0b0b0; margin-bottom: 16px;">πŸ“Š Results</h3>')
with gr.Group():
score_output = gr.Number(
label="Similarity Score",
precision=4,
)
result_markdown = gr.Markdown(
value="*Upload an image and enter a query to get started*",
)
gr.HTML('<h3 style="color: #b0b0b0; margin: 24px 0 16px 0;">πŸ”₯ Similarity Heatmap</h3>')
heatmap_output = gr.Image(
label="Heatmap Visualization",
type="pil",
height=400,
)
gr.HTML("""
<div style="display: flex; align-items: center; gap: 12px; padding: 12px; background: rgba(255,255,255,0.03); border-radius: 8px; margin-top: 8px;">
<div style="width: 150px; height: 20px; background: linear-gradient(90deg, #0b0924 0%, #1f1147 20%, #3b1c6c 35%, #4a3880 50%, #3e7a8c 70%, #5ec5c0 85%, #c3f0e4 100%); border-radius: 4px;"></div>
<span style="color: #888; font-size: 0.9rem;">Low β†’ High Relevance (mako colormap)</span>
</div>
""")
with gr.Accordion("πŸ“š Corpus Ranking", open=True):
ranking_output = gr.Markdown(
value="*Enable 'Compare with Corpus' to see how your document ranks*",
)
with gr.Accordion("πŸ€– VLM Answer", open=True, visible=True):
vlm_answer_output = gr.Textbox(
label="Answer from Qwen3-VL-4B",
lines=6,
interactive=False,
placeholder="Enable VLM and analyze to get an AI-generated answer...",
)
# Examples section
if available_examples:
gr.HTML('<h3 style="color: #b0b0b0; margin: 32px 0 16px 0;">πŸ“š Example Documents (31 multilingual documents)</h3>')
gr.HTML("""
<div style="padding: 12px; background: rgba(102, 126, 234, 0.1); border-radius: 8px; margin-bottom: 16px;">
<p style="color: #a0a0a0; margin: 0; font-size: 0.9rem;">
🌍 <strong>Languages:</strong> German (DE), English (EN), French (FR), Spanish (ES)<br>
πŸ“‚ <strong>Categories:</strong> Annual Reports, ESG, Tax Forms, Scientific Papers, Energy/Hydrogen, Healthcare, Economic Forecasts
</p>
</div>
""")
gr.Examples(
examples=available_examples,
inputs=[image_input, query_input],
label="Click an example to load it",
)
# Info section
gr.HTML("""
<details style="margin-top: 12px; padding: 12px; background: rgba(255,200,100,0.08); border: 1px solid rgba(255,200,100,0.2); border-radius: 8px;">
<summary style="cursor: pointer; color: #e0c080; font-weight: 500;">ℹ️ About Heatmap Differences: ColQwen3 vs ColLFM2</summary>
<div style="margin-top: 12px; color: #a0a0a0; font-size: 0.85rem; line-height: 1.6;">
<p><strong style="color: #80c0ff;">ColQwen3 (Qwen-based):</strong> Uses Qwen3-VL's vision encoder which preserves strong spatial locality in patch embeddings. Each image patch maintains distinct features, allowing query tokens to differentiate between regions. Result: <em>precise, localized heatmaps</em>.</p>
<p style="margin-top: 8px;"><strong style="color: #ffa080;">ColLFM2 (LFM2-based):</strong> Uses SigLIP2 NaFlex vision encoder with <em>pixel unshuffle</em> for efficient token reduction. This merges spatial information across patches, producing more "holistic" embeddings. Query tokens show high correlation (~0.97) across all patches. Result: <em>region-level relevance</em> rather than word-level precision.</p>
<p style="margin-top: 8px;"><strong style="color: #80ffa0;">Why ColLFM2 still performs well for retrieval:</strong> The subtle similarity differences (e.g., 0.88 vs 0.94) are sufficient for ranking documents correctly. ColLFM2 excels at determining <em>if</em> a document is relevant, while ColQwen3 better shows <em>where</em> the relevance is.</p>
</div>
</details>
""")
# Footer
gr.HTML("""
<div style="text-align: center; padding: 32px 0 16px 0; border-top: 1px solid rgba(255,255,255,0.1); margin-top: 32px;">
<p style="color: #666; font-size: 0.9rem;">
πŸ’‘ <b>Tip:</b> ColQwen3 models provide the best heatmap visualization. Enable VLM (Qwen3-VL-4B) for AI-generated answers.
</p>
<p style="color: #555; font-size: 0.85rem; margin-top: 8px;">
Made with ❀️ by <a href="https://vago-solutions.ai" target="_blank" style="color: #667eea;">VAGO Solutions</a>
</p>
</div>
""")
# Event handlers
submit_btn.click(
fn=process_query_with_corpus,
inputs=[model_dropdown, image_input, query_input, enable_vlm, enable_corpus],
outputs=[score_output, result_markdown, heatmap_output, vlm_answer_output, ranking_output],
)
query_input.submit(
fn=process_query_with_corpus,
inputs=[model_dropdown, image_input, query_input, enable_vlm, enable_corpus],
outputs=[score_output, result_markdown, heatmap_output, vlm_answer_output, ranking_output],
)
# Show hint when VLM is enabled
enable_vlm.change(
fn=on_vlm_toggle,
inputs=[enable_vlm],
outputs=[enable_vlm],
)
return demo
if __name__ == "__main__":
install_fa2()
demo = create_demo()
demo.queue(max_size=10).launch(debug=True)