Spaces:

Livengood
/

Instance-VRAM-Calculator

Running

Livengood Claude commited on 26 days ago

Commit

cecbf94

1 Parent(s): 548f0fb

Fix Python 3.9 compatibility and HuggingFace Spaces issues

Critical fixes:
- Add 'from __future__ import annotations' for Python 3.9 compatibility
- Remove type hints that caused SyntaxError on older Python
- Use gr.Tabs() with gr.TabItem() for reliable tab switching
- Remove problematic 'filter' parameter from list_models
- Simplify error handling to return messages instead of raising
- Ensure all Examples components work correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show

app.py +278 -700

app.py CHANGED Viewed

@@ -1,17 +1,7 @@
 """
 VRAM & Instance Type Calculator for HuggingFace Models
-Fetches model metadata from HF Hub and calculates:
-- Minimum VRAM required for inference and training
-- KV cache requirements at various context lengths
-- Recommended GPUs and cloud instances
-- Multi-GPU tensor parallelism estimates
-- Quantization options with detailed breakdown
-- Model comparison across multiple models
-- Throughput estimation
-- Cloud cost analysis
-- LoRA/QLoRA fine-tuning memory requirements
 """
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download, list_models
@@ -25,31 +15,25 @@ api = HfApi()
 # GPU specs: name -> (VRAM in GB, typical cloud instance, category, hourly_cost, tflops_fp16)
 GPU_SPECS = {
-    # Consumer GPUs
     "RTX 3080": (10, "Consumer", "consumer", 0, 29.8),
     "RTX 3090": (24, "Consumer", "consumer", 0, 35.6),
     "RTX 4080": (16, "Consumer", "consumer", 0, 48.7),
     "RTX 4090": (24, "Consumer", "consumer", 0, 82.6),
     "RTX 5090": (32, "Consumer (est.)", "consumer", 0, 105.0),
-    # Apple Silicon
     "M2 Ultra": (192, "Mac Studio (Unified)", "apple", 0, 27.2),
     "M3 Max": (128, "MacBook Pro (Unified)", "apple", 0, 14.2),
     "M4 Max": (128, "MacBook Pro (Unified)", "apple", 0, 18.0),
-    # Workstation GPUs
     "RTX A6000": (48, "Workstation", "workstation", 0, 38.7),
     "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud", 1.00, 91.6),
-    # Cloud GPUs
     "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud", 1.00, 31.2),
     "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud", 0.70, 30.3),
-    "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud", 3.00, 77.9),
-    "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud", 5.00, 77.9),
-    "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud", 8.00, 267.6),
-    "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud", 12.00, 296.0),
-    # AMD GPUs
     "MI300X": (192, "AMD Cloud Instances", "amd", 6.00, 383.0),
 }
-# Bytes per element for different dtypes
 DTYPE_BYTES = {
     "F32": 4, "float32": 4,
     "F16": 2, "float16": 2,
@@ -61,77 +45,61 @@ DTYPE_BYTES = {
     "I64": 8, "int64": 8,
 }
-# Serving framework overhead multipliers
 SERVING_FRAMEWORKS = {
     "None (raw PyTorch)": 1.20,
     "vLLM": 1.10,
-    "TGI (Text Generation Inference)": 1.15,
     "llama.cpp": 1.05,
-    "Transformers (HuggingFace)": 1.25,
     "Ollama": 1.08,
 }
-# Quantization methods with their characteristics
 QUANTIZATION_METHODS = {
-    "FP16/BF16": {"bytes_per_param": 2.0, "quality": "100%", "desc": "Full precision"},
-    "INT8 (LLM.int8)": {"bytes_per_param": 1.0, "quality": "~99%", "desc": "Good balance"},
-    "GPTQ 8-bit": {"bytes_per_param": 1.0, "quality": "~99%", "desc": "GPU optimized"},
-    "AWQ 4-bit": {"bytes_per_param": 0.5, "quality": "~97%", "desc": "Activation-aware"},
-    "GPTQ 4-bit": {"bytes_per_param": 0.5, "quality": "~95%", "desc": "GPU optimized"},
-    "GGUF Q8_0": {"bytes_per_param": 1.0, "quality": "~99%", "desc": "llama.cpp format"},
-    "GGUF Q6_K": {"bytes_per_param": 0.75, "quality": "~98%", "desc": "llama.cpp format"},
-    "GGUF Q5_K_M": {"bytes_per_param": 0.625, "quality": "~97%", "desc": "llama.cpp format"},
-    "GGUF Q4_K_M": {"bytes_per_param": 0.5, "quality": "~95%", "desc": "llama.cpp format"},
-    "GGUF Q3_K_M": {"bytes_per_param": 0.375, "quality": "~90%", "desc": "llama.cpp format"},
-    "GGUF Q2_K": {"bytes_per_param": 0.3125, "quality": "~85%", "desc": "Aggressive compression"},
 }
-def bytes_to_gb(b: int | float) -> float:
     return b / (1024 ** 3)
-def gb_to_bytes(gb: float) -> float:
-    return gb * (1024 ** 3)
 @lru_cache(maxsize=50)
-def get_model_info_cached(model_id: str):
-    """Fetch model info from HF Hub with caching."""
     try:
-        info = api.model_info(model_id, files_metadata=True)
-        return info
     except Exception as e:
-        return {"_error": str(e)}
 @lru_cache(maxsize=50)
-def get_config_cached(model_id: str) -> str:
-    """Fetch config.json with caching. Returns JSON string for cache compatibility."""
     try:
         config_path = hf_hub_download(model_id, "config.json")
         with open(config_path) as f:
             return f.read()
-    except Exception as e:
-        return json.dumps({"_error": str(e)})
-def get_model_info(model_id: str):
-    """Fetch model info from HF Hub."""
     result = get_model_info_cached(model_id)
-    if isinstance(result, dict) and "_error" in result:
-        raise gr.Error(f"Could not fetch model info: {result['_error']}")
     return result
-def get_config(model_id: str) -> dict:
-    """Get config.json for architecture details."""
     config_str = get_config_cached(model_id)
     return json.loads(config_str)
-def estimate_params_from_safetensors(info) -> tuple[int, str]:
-    """Extract parameter count and dtype from safetensors metadata."""
     if hasattr(info, 'safetensors') and info.safetensors:
         param_count = info.safetensors.total
         params_by_dtype = info.safetensors.parameters
@@ -141,8 +109,7 @@ def estimate_params_from_safetensors(info) -> tuple[int, str]:
     return 0, "F16"
-def get_head_dim(config: dict) -> int:
-    """Calculate head dimension from config, with fallbacks."""
     if "head_dim" in config:
         return config["head_dim"]
     hidden_size = config.get("hidden_size", config.get("n_embd", 0))
@@ -152,737 +119,348 @@ def get_head_dim(config: dict) -> int:
     return 128
-def estimate_kv_cache_size(
-    num_layers: int,
-    num_kv_heads: int,
-    head_dim: int,
-    context_length: int,
-    batch_size: int = 1,
-    dtype_bytes: int = 2
-) -> int:
-    """KV cache size = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes"""
     return 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
-def estimate_training_memory(param_count: int, dtype_bytes: int, optimizer: str = "AdamW") -> dict:
-    """Estimate training memory requirements."""
-    weights_bytes = param_count * dtype_bytes
-    gradients_bytes = param_count * dtype_bytes
     if optimizer == "AdamW":
-        optimizer_bytes = param_count * 4 * 2
     elif optimizer == "SGD":
-        optimizer_bytes = 0
     elif optimizer == "SGD + Momentum":
-        optimizer_bytes = param_count * 4
-    elif optimizer == "8-bit Adam":
-        optimizer_bytes = param_count * 1 * 2
     else:
-        optimizer_bytes = param_count * 4 * 2
-    return {
-        "weights": weights_bytes,
-        "gradients": gradients_bytes,
-        "optimizer": optimizer_bytes,
-        "total_base": weights_bytes + gradients_bytes + optimizer_bytes
-    }
-def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism: str) -> dict:
-    """Calculate memory distribution across multiple GPUs."""
-    if parallelism == "Tensor Parallelism":
-        per_gpu = total_vram_gb / num_gpus
-        overhead = 0.05 * total_vram_gb
-        return {
-            "per_gpu": per_gpu + (overhead / num_gpus),
-            "total": total_vram_gb + overhead,
-            "efficiency": "High (best for inference)",
-        }
-    elif parallelism == "Pipeline Parallelism":
-        per_gpu = total_vram_gb / num_gpus
-        overhead = 0.1 * total_vram_gb
-        return {
-            "per_gpu": per_gpu + (overhead / num_gpus),
-            "total": total_vram_gb + overhead,
-            "efficiency": "Medium (good for training)",
-        }
-    else:
-        return {
-            "per_gpu": total_vram_gb,
-            "total": total_vram_gb * num_gpus,
-            "efficiency": "Low memory efficiency (training only)",
-        }
-def estimate_lora_memory(
-    param_count: int,
-    dtype_bytes: int,
-    lora_rank: int = 16,
-    lora_alpha: int = 32,
-    target_modules: int = 4,
-    use_qlora: bool = False
-) -> dict:
-    """Estimate LoRA/QLoRA fine-tuning memory requirements."""
     if use_qlora:
-        base_weights_bytes = param_count * 0.5
     else:
-        base_weights_bytes = param_count * dtype_bytes
-    lora_params_ratio = (lora_rank * 2 * target_modules) / 1000
-    lora_params = int(param_count * lora_params_ratio * 0.01)
-    lora_weights_bytes = lora_params * dtype_bytes
-    gradients_bytes = lora_params * dtype_bytes
-    optimizer_bytes = lora_params * 4 * 2
-    activation_bytes = base_weights_bytes * 0.5
-    return {
-        "base_weights": base_weights_bytes,
-        "lora_weights": lora_weights_bytes,
-        "lora_params": lora_params,
-        "gradients": gradients_bytes,
-        "optimizer": optimizer_bytes,
-        "activations": activation_bytes,
-        "total": base_weights_bytes + lora_weights_bytes + gradients_bytes + optimizer_bytes + activation_bytes,
-        "vs_full_finetune_ratio": 0.3 if use_qlora else 0.5,
-    }
-def estimate_throughput(
-    param_count: int,
-    gpu_tflops: float,
-    batch_size: int = 1,
-    context_length: int = 4096,
-    is_prefill: bool = False
-) -> dict:
-    """Estimate tokens per second throughput."""
     flops_per_token = 2 * param_count
-    peak_tokens_per_sec = (gpu_tflops * 1e12) / flops_per_token
-    memory_bandwidth_tbs = 1.0
-    bytes_per_token = param_count * 2
-    memory_bound_tokens = (memory_bandwidth_tbs * 1e12) / bytes_per_token
-    if is_prefill:
-        effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens * 10) * batch_size
-    else:
-        effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens) * batch_size
-    efficiency = 0.4
-    realistic_tokens = effective_tokens * efficiency
-    return {
-        "peak_theoretical": peak_tokens_per_sec,
-        "memory_bound": memory_bound_tokens,
-        "estimated_tokens_per_sec": realistic_tokens,
-        "batch_size": batch_size,
-        "is_prefill": is_prefill,
-    }
-def calculate_cost_estimate(vram_required: float, hours_per_day: float = 8, days_per_month: float = 22) -> list:
-    """Calculate cost estimates for cloud GPUs that fit the model."""
-    estimates = []
-    for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
-        if vram >= vram_required and hourly_cost > 0:
-            daily_cost = hourly_cost * hours_per_day
-            monthly_cost = daily_cost * days_per_month
-            estimates.append({
-                "gpu": gpu_name,
-                "vram": vram,
-                "hourly": hourly_cost,
-                "daily": daily_cost,
-                "monthly": monthly_cost,
-                "instance": instance,
-            })
-    return sorted(estimates, key=lambda x: x["hourly"])
-def search_models_fn(query: str) -> list:
-    """Search HuggingFace models by name."""
     if not query or len(query) < 2:
-        return []
     try:
-        models = list(list_models(
-            search=query,
-            sort="downloads",
-            direction=-1,
-            limit=10,
-            filter="text-generation"
-        ))
-        return [m.id for m in models]
     except Exception:
-        return []
-def calculate_flash_attention_savings(kv_cache_bytes: int, context_length: int) -> dict:
-    """Estimate memory savings from Flash Attention."""
-    standard_attention_overhead = context_length * context_length * 2
-    flash_attention_overhead = context_length * 128 * 2
-    savings_bytes = standard_attention_overhead - flash_attention_overhead
-    savings_ratio = 1 - (flash_attention_overhead / max(standard_attention_overhead, 1))
-    return {
-        "standard_overhead_gb": bytes_to_gb(standard_attention_overhead),
-        "flash_overhead_gb": bytes_to_gb(flash_attention_overhead),
-        "savings_gb": bytes_to_gb(savings_bytes),
-        "savings_percent": savings_ratio * 100,
-    }
-def calculate_vram(
-    model_id: str,
-    context_length: int = 4096,
-    batch_size: int = 1,
-    mode: str = "Inference",
-    optimizer: str = "AdamW",
-    serving_framework: str = "None (raw PyTorch)",
-    num_gpus: int = 1,
-    parallelism: str = "Tensor Parallelism",
-    use_flash_attention: bool = True,
-    lora_rank: int = 16,
-    show_throughput: bool = True,
-    show_cost: bool = True
-):
-    """Main calculation function. Returns (markdown_results, chart_dataframe)."""
-    model_id = model_id.strip()
     if not model_id:
-        raise gr.Error("Please enter a model ID")
     if "/" not in model_id:
-        raise gr.Error("Model ID should be in format 'organization/model-name'")
-    info = get_model_info(model_id)
-    config = get_config(model_id)
-    results = []
-    results.append(f"## Model: [{model_id}](https://huggingface.co/{model_id})\n")
     param_count, dominant_dtype = estimate_params_from_safetensors(info)
     if param_count == 0:
-        results.append("Could not determine parameter count from safetensors metadata.\n")
-        results.append("Model may use pytorch_model.bin or other format.\n")
-        return "\n".join(results), None
     dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
     params_b = param_count / 1e9
-    results.append(f"**Parameters:** {params_b:.2f}B ({param_count:,})")
-    results.append(f"**Dominant dtype:** {dominant_dtype} ({dtype_bytes} bytes/param)")
-    results.append(f"**Mode:** {mode}")
-    weights_bytes = param_count * dtype_bytes
-    weights_gb = bytes_to_gb(weights_bytes)
-    results.append(f"\n### Weight Memory")
-    results.append(f"Model weights: **{weights_gb:.2f} GB**")
-    num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
-    hidden_size = config.get("hidden_size", config.get("n_embd", 0))
-    num_attention_heads = config.get("num_attention_heads", config.get("n_head", 0))
-    num_kv_heads = config.get("num_key_value_heads", num_attention_heads)
     head_dim = get_head_dim(config)
-    max_position = config.get("max_position_embeddings", config.get("n_positions", "N/A"))
-    results.append(f"\n### Architecture")
-    if "_error" in config:
-        results.append(f"Could not fetch config.json (model may be gated)")
-        kv_gb = 0
-    elif num_layers and hidden_size:
-        results.append(f"- Layers: {num_layers}")
-        results.append(f"- Hidden size: {hidden_size}")
-        results.append(f"- Attention heads: {num_attention_heads}")
-        results.append(f"- KV heads: {num_kv_heads} {'(GQA)' if num_kv_heads != num_attention_heads else '(MHA)'}")
-        results.append(f"- Head dimension: {head_dim}")
-        if isinstance(max_position, int):
-            results.append(f"- Max context: {max_position:,}")
-        else:
-            results.append(f"- Max context: {max_position}")
-        results.append(f"\n### KV Cache (batch_size={batch_size})")
-        results.append("| Context | KV Cache | + Weights | Status |")
-        results.append("|---------|----------|-----------|--------|")
-        context_points = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
-        for ctx_len in context_points:
-            if ctx_len > context_length * 2 and ctx_len > 8192:
-                break
-            kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, ctx_len, batch_size, dtype_bytes)
-            kv_gb_temp = bytes_to_gb(kv_bytes)
-            total_temp = weights_gb + kv_gb_temp
-            marker = " **<- selected**" if ctx_len == context_length else ""
-            results.append(f"| {ctx_len:,} | {kv_gb_temp:.2f} GB | {total_temp:.2f} GB |{marker} |")
-        kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes)
-        kv_gb = bytes_to_gb(kv_bytes)
-    else:
-        results.append("Could not find architecture details")
-        kv_gb = 0
-    flash_savings = None
-    if use_flash_attention and kv_gb > 0:
-        kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes)
-        flash_savings = calculate_flash_attention_savings(kv_bytes, context_length)
     if mode == "Training (Full)":
-        training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
-        base_gb = bytes_to_gb(training_mem["total_base"])
         activation_gb = weights_gb * 2 * batch_size
-        if use_flash_attention and flash_savings:
-            activation_gb -= flash_savings["savings_gb"]
-            activation_gb = max(0.1, activation_gb)
-        total_gb = base_gb + kv_gb + activation_gb
-        results.append(f"\n### Training Memory Breakdown")
         results.append(f"- Weights: {weights_gb:.2f} GB")
-        results.append(f"- Gradients: {bytes_to_gb(training_mem['gradients']):.2f} GB")
-        results.append(f"- Optimizer ({optimizer}): {bytes_to_gb(training_mem['optimizer']):.2f} GB")
-        results.append(f"- KV Cache: {kv_gb:.2f} GB")
-        results.append(f"- Activations (est.): {activation_gb:.2f} GB")
-        chart_data = {
-            "Weights": weights_gb,
-            "Gradients": bytes_to_gb(training_mem['gradients']),
-            "Optimizer": bytes_to_gb(training_mem['optimizer']),
-            "KV Cache": kv_gb,
-            "Activations": activation_gb,
-        }
     elif mode == "LoRA Fine-tuning":
-        lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=False)
-        total_gb = bytes_to_gb(lora_mem["total"])
-        results.append(f"\n### LoRA Fine-tuning (rank={lora_rank})")
-        results.append(f"- Base weights (frozen): {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
-        results.append(f"- LoRA adapters: {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
-        results.append(f"- Gradients (LoRA only): {bytes_to_gb(lora_mem['gradients']):.3f} GB")
-        results.append(f"- Optimizer states: {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
-        results.append(f"- Activations: {bytes_to_gb(lora_mem['activations']):.2f} GB")
-        results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
-        chart_data = {
-            "Base Weights": bytes_to_gb(lora_mem['base_weights']),
-            "LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
-            "Gradients": bytes_to_gb(lora_mem['gradients']),
-            "Optimizer": bytes_to_gb(lora_mem['optimizer']),
-            "Activations": bytes_to_gb(lora_mem['activations']),
-        }
     elif mode == "QLoRA Fine-tuning":
-        lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=True)
-        total_gb = bytes_to_gb(lora_mem["total"])
-        results.append(f"\n### QLoRA Fine-tuning (4-bit base, rank={lora_rank})")
-        results.append(f"- Base weights (4-bit): {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
-        results.append(f"- LoRA adapters: {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
-        results.append(f"- Gradients (LoRA only): {bytes_to_gb(lora_mem['gradients']):.3f} GB")
-        results.append(f"- Optimizer states: {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
-        results.append(f"- Activations: {bytes_to_gb(lora_mem['activations']):.2f} GB")
-        results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
-        chart_data = {
-            "Base (4-bit)": bytes_to_gb(lora_mem['base_weights']),
-            "LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
-            "Gradients": bytes_to_gb(lora_mem['gradients']),
-            "Optimizer": bytes_to_gb(lora_mem['optimizer']),
-            "Activations": bytes_to_gb(lora_mem['activations']),
-        }
-    else:
-        framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
-        base_total = weights_gb + kv_gb
-        overhead_gb = base_total * (framework_overhead - 1)
-        if use_flash_attention and flash_savings:
-            overhead_gb -= min(flash_savings["savings_gb"] * 0.1, overhead_gb * 0.5)
-            overhead_gb = max(0, overhead_gb)
-        total_gb = base_total + overhead_gb
-        results.append(f"\n### Inference Memory ({serving_framework})")
         results.append(f"- Weights: {weights_gb:.2f} GB")
         results.append(f"- KV Cache: {kv_gb:.2f} GB")
-        results.append(f"- Framework overhead: {overhead_gb:.2f} GB ({(framework_overhead-1)*100:.0f}%)")
-        chart_data = {
-            "Weights": weights_gb,
-            "KV Cache": kv_gb,
-            "Overhead": overhead_gb,
-        }
-    if use_flash_attention and flash_savings and flash_savings["savings_gb"] > 0.01:
-        results.append(f"\n### Flash Attention")
-        results.append(f"- Enabled: Yes")
-        results.append(f"- Peak memory savings: ~{flash_savings['savings_gb']:.2f} GB ({flash_savings['savings_percent']:.1f}%)")
-    results.append(f"\n### Total VRAM Required: **{total_gb:.2f} GB**")
     if num_gpus > 1:
-        multi_gpu = calculate_multi_gpu_split(total_gb, num_gpus, parallelism)
-        results.append(f"\n### Multi-GPU ({num_gpus}x GPUs, {parallelism})")
-        results.append(f"- Per GPU: {multi_gpu['per_gpu']:.2f} GB")
-        results.append(f"- Total across GPUs: {multi_gpu['total']:.2f} GB")
-        results.append(f"- Efficiency: {multi_gpu['efficiency']}")
-        effective_vram_needed = multi_gpu['per_gpu']
     else:
-        effective_vram_needed = total_gb
     results.append(f"\n### GPU Recommendations")
-    results.append("| GPU | VRAM | Fits? | Headroom | Est. tok/s | Instance |")
-    results.append("|-----|------|-------|----------|------------|----------|")
-    for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
-        fits = "Yes" if vram >= effective_vram_needed else "No"
-        headroom = vram - effective_vram_needed
-        headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
-        if show_throughput and vram >= effective_vram_needed:
-            throughput = estimate_throughput(param_count, tflops, batch_size, context_length)
-            tok_str = f"~{throughput['estimated_tokens_per_sec']:.0f}"
-        else:
-            tok_str = "-"
-        results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {tok_str} | {instance} |")
-    if effective_vram_needed > 24:
-        results.append(f"\n### Quantization Options")
-        results.append("To fit on consumer GPUs (24 GB or less), consider:\n")
-        results.append("| Method | Est. Size | Quality | Notes |")
-        results.append("|--------|-----------|---------|-------|")
-        for method, specs in QUANTIZATION_METHODS.items():
-            quant_size = bytes_to_gb(param_count * specs["bytes_per_param"])
-            quant_with_overhead = quant_size * 1.1
-            fits = "Yes" if quant_with_overhead <= 24 else "No"
-            results.append(f"| {method} | {quant_with_overhead:.1f} GB | {specs['quality']} | {fits} - {specs['desc']} |")
-        model_name = model_id.split('/')[-1]
-        results.append(f"\n**Tip:** Search for `{model_name} GGUF` or `{model_name} AWQ` on HuggingFace.")
     if show_cost:
-        cost_estimates = calculate_cost_estimate(effective_vram_needed)
-        if cost_estimates:
-            results.append(f"\n### Cloud Cost Estimates")
-            results.append("*Based on 8 hrs/day, 22 days/month*\n")
-            results.append("| GPU | Hourly | Daily | Monthly |")
-            results.append("|-----|--------|-------|---------|")
-            for est in cost_estimates[:5]:
-                results.append(f"| {est['gpu']} | ${est['hourly']:.2f} | ${est['daily']:.2f} | ${est['monthly']:.0f} |")
-    # Create DataFrame for chart
-    df = pd.DataFrame({
-        "Component": list(chart_data.keys()),
-        "GB": list(chart_data.values())
-    })
     return "\n".join(results), df
-def compare_models_fn(model_ids_text: str, context_length: int = 4096) -> str:
-    """Compare multiple models side by side."""
-    model_ids = [m.strip() for m in model_ids_text.split("\n") if m.strip()]
     if len(model_ids) < 2:
-        return "Please enter at least 2 model IDs (one per line)"
     if len(model_ids) > 5:
-        return "Maximum 5 models for comparison"
-    results = ["## Model Comparison\n"]
-    comparison_data = []
     for model_id in model_ids:
         try:
             info = get_model_info(model_id)
             config = get_config(model_id)
-            param_count, dominant_dtype = estimate_params_from_safetensors(info)
             if param_count == 0:
-                comparison_data.append({"model": model_id, "error": "Could not determine parameters"})
                 continue
-            dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
             weights_gb = bytes_to_gb(param_count * dtype_bytes)
-            num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
-            num_kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", 0))
             head_dim = get_head_dim(config)
-            kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes)
-            kv_gb = bytes_to_gb(kv_bytes)
-            total_inference = weights_gb + kv_gb
-            training_mem = estimate_training_memory(param_count, dtype_bytes)
-            training_gb = bytes_to_gb(training_mem["total_base"]) + weights_gb * 2
-            qlora_mem = estimate_lora_memory(param_count, dtype_bytes, 16, use_qlora=True)
-            qlora_gb = bytes_to_gb(qlora_mem["total"])
-            comparison_data.append({
-                "model": model_id.split("/")[-1],
-                "full_id": model_id,
-                "params": f"{param_count/1e9:.1f}B",
-                "inference_gb": total_inference,
-                "training_gb": training_gb,
-                "qlora_gb": qlora_gb,
-            })
         except Exception as e:
-            comparison_data.append({"model": model_id, "error": str(e)})
-    results.append(f"*Context length: {context_length:,}*\n")
-    results.append("| Model | Params | Inference | Training | QLoRA |")
-    results.append("|-------|--------|-----------|----------|-------|")
-    for data in comparison_data:
-        if "error" in data:
-            results.append(f"| {data['model']} | Error | - | - | - |")
-        else:
-            results.append(
-                f"| [{data['model']}](https://huggingface.co/{data['full_id']}) | "
-                f"{data['params']} | {data['inference_gb']:.1f} GB | "
-                f"{data['training_gb']:.1f} GB | {data['qlora_gb']:.1f} GB |"
-            )
-    valid_data = [d for d in comparison_data if "error" not in d]
-    if len(valid_data) >= 2:
-        results.append("\n### Recommendations")
-        min_inference = min(valid_data, key=lambda x: x["inference_gb"])
-        min_training = min(valid_data, key=lambda x: x["training_gb"])
-        min_qlora = min(valid_data, key=lambda x: x["qlora_gb"])
-        results.append(f"- **Best for inference:** {min_inference['model']} ({min_inference['inference_gb']:.1f} GB)")
-        results.append(f"- **Best for training:** {min_training['model']} ({min_training['training_gb']:.1f} GB)")
-        results.append(f"- **Best for QLoRA:** {min_qlora['model']} ({min_qlora['qlora_gb']:.1f} GB)")
     return "\n".join(results)
-def export_results_fn(result_text: str, format_type: str) -> str:
-    """Export results to different formats."""
-    if not result_text:
-        return "No results to export. Run a calculation first."
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    if format_type == "JSON":
-        lines = result_text.split("\n")
-        data = {"timestamp": timestamp, "raw_markdown": result_text, "sections": {}}
-        current_section = "header"
-        for line in lines:
-            if line.startswith("### "):
-                current_section = line.replace("### ", "").strip()
-                data["sections"][current_section] = []
-            elif line.strip():
-                if current_section not in data["sections"]:
-                    data["sections"][current_section] = []
-                data["sections"][current_section].append(line.strip())
-        return json.dumps(data, indent=2)
-    else:
-        plain = result_text.replace("**", "").replace("###", "\n===").replace("##", "\n===")
-        return f"VRAM Calculator Export - {timestamp}\n{'='*50}\n\n{plain}"
-def do_search(query: str):
-    """Search for models and return dropdown choices."""
-    if not query:
-        return gr.update(choices=[], value=None)
-    results = search_models_fn(query)
-    if results:
-        return gr.update(choices=results, value=results[0])
-    return gr.update(choices=["No models found"], value=None)
-def select_from_search(selected: str) -> str:
-    """Select a model from search results."""
-    if selected and selected != "No models found":
-        return selected
-    return ""
-# Build Gradio interface
-with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# VRAM & Instance Type Calculator")
-    gr.Markdown("Estimate GPU memory requirements for HuggingFace models.")
-    with gr.Tab("Calculator"):
-        with gr.Row():
-            model_input = gr.Textbox(
-                label="Model ID",
-                placeholder="meta-llama/Llama-3.1-8B",
-                info="Full HuggingFace model ID (org/model-name)",
-                scale=2
-            )
-            search_input = gr.Textbox(
-                label="Search Models",
-                placeholder="llama 8b",
-                info="Search HuggingFace",
-                scale=1
             )
-        with gr.Row():
-            search_btn = gr.Button("Search HuggingFace", scale=1)
-            search_results = gr.Dropdown(
-                label="Search Results",
-                choices=[],
-                interactive=True,
-                scale=2
             )
-        search_btn.click(fn=do_search, inputs=[search_input], outputs=[search_results])
-        search_results.change(fn=select_from_search, inputs=[search_results], outputs=[model_input])
-        with gr.Row():
-            mode_input = gr.Radio(
-                choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
-                value="Inference",
-                label="Mode"
-            )
-            context_input = gr.Slider(
-                label="Context Length",
-                minimum=512,
-                maximum=131072,
-                value=4096,
-                step=512
             )
-            batch_input = gr.Slider(
-                label="Batch Size",
-                minimum=1,
-                maximum=64,
-                value=1,
-                step=1
-            )
-        with gr.Accordion("Advanced Options", open=False):
-            with gr.Row():
-                serving_input = gr.Dropdown(
-                    choices=list(SERVING_FRAMEWORKS.keys()),
-                    value="None (raw PyTorch)",
-                    label="Serving Framework"
-                )
-                optimizer_input = gr.Dropdown(
-                    choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
-                    value="AdamW",
-                    label="Optimizer (Training mode)"
-                )
-                lora_rank_input = gr.Slider(
-                    label="LoRA Rank",
-                    minimum=4,
-                    maximum=128,
-                    value=16,
-                    step=4
-                )
-            with gr.Row():
-                num_gpus_input = gr.Slider(
-                    label="Number of GPUs",
-                    minimum=1,
-                    maximum=8,
-                    value=1,
-                    step=1
-                )
-                parallelism_input = gr.Dropdown(
-                    choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
-                    value="Tensor Parallelism",
-                    label="Parallelism Strategy"
-                )
-                flash_attention_input = gr.Checkbox(
-                    label="Use Flash Attention",
-                    value=True
-                )
-            with gr.Row():
-                show_throughput_input = gr.Checkbox(label="Show Throughput Estimates", value=True)
-                show_cost_input = gr.Checkbox(label="Show Cost Estimates", value=True)
-        calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
-        with gr.Row():
-            output = gr.Markdown(label="Results")
-            chart_output = gr.BarPlot(
-                x="Component",
-                y="GB",
-                title="Memory Breakdown",
-                height=350,
-                width=400
             )
-        calculate_btn.click(
-            fn=calculate_vram,
-            inputs=[
-                model_input, context_input, batch_input, mode_input,
-                optimizer_input, serving_input, num_gpus_input, parallelism_input,
-                flash_attention_input, lora_rank_input, show_throughput_input, show_cost_input
-            ],
-            outputs=[output, chart_output]
-        )
-        gr.Markdown("### Popular Models")
-        gr.Examples(
-            examples=[
-                ["meta-llama/Llama-3.1-8B", 4096, 1],
-                ["meta-llama/Llama-3.1-70B", 8192, 1],
-                ["mistralai/Mistral-7B-v0.1", 8192, 1],
-                ["Qwen/Qwen2.5-72B", 32768, 1],
-                ["google/gemma-2-27b", 8192, 1],
-                ["microsoft/phi-4", 16384, 1],
-            ],
-            inputs=[model_input, context_input, batch_input],
-        )
-    with gr.Tab("Compare Models"):
-        gr.Markdown("Compare VRAM requirements across multiple models. Enter model IDs one per line (2-5 models).")
-        compare_models_input = gr.Textbox(
-            label="Model IDs (one per line)",
-            placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B",
-            lines=5,
-        )
-        compare_context_input = gr.Slider(
-            label="Context Length",
-            minimum=512,
-            maximum=131072,
-            value=4096,
-            step=512,
-        )
-        compare_btn = gr.Button("Compare Models", variant="primary")
-        compare_output = gr.Markdown(label="Comparison Results")
-        compare_btn.click(
-            fn=compare_models_fn,
-            inputs=[compare_models_input, compare_context_input],
-            outputs=[compare_output]
-        )
-        gr.Markdown("### Example Comparisons")
-        gr.Examples(
-            examples=[
-                ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B", 4096],
-                ["meta-llama/Llama-3.1-70B\nQwen/Qwen2.5-72B", 8192],
-            ],
-            inputs=[compare_models_input, compare_context_input],
-        )
-    with gr.Tab("Export"):
-        gr.Markdown("Export calculation results to JSON or plain text. Copy results from Calculator tab.")
-        export_input = gr.Textbox(
-            label="Paste Results Here",
-            placeholder="Paste the calculation results...",
-            lines=10,
-        )
-        export_format = gr.Radio(
-            choices=["JSON", "Plain Text"],
-            value="JSON",
-            label="Export Format"
-        )
-        export_btn = gr.Button("Export", variant="primary")
-        export_output = gr.Textbox(
-            label="Exported Data",
-            lines=15,
-            show_copy_button=True,
-        )
-        export_btn.click(
-            fn=export_results_fn,
-            inputs=[export_input, export_format],
-            outputs=[export_output]
-        )
-    gr.Markdown("""
-    ---
-    **Notes:** Estimates are approximate. Flash Attention and other optimizations can reduce peak memory.
-    Throughput estimates assume ideal conditions. Built with Gradio & HuggingFace Hub API.
-    """)
 if __name__ == "__main__":
     demo.launch()

 """
 VRAM & Instance Type Calculator for HuggingFace Models
 """
+from __future__ import annotations
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download, list_models
 # GPU specs: name -> (VRAM in GB, typical cloud instance, category, hourly_cost, tflops_fp16)
 GPU_SPECS = {
     "RTX 3080": (10, "Consumer", "consumer", 0, 29.8),
     "RTX 3090": (24, "Consumer", "consumer", 0, 35.6),
     "RTX 4080": (16, "Consumer", "consumer", 0, 48.7),
     "RTX 4090": (24, "Consumer", "consumer", 0, 82.6),
     "RTX 5090": (32, "Consumer (est.)", "consumer", 0, 105.0),
     "M2 Ultra": (192, "Mac Studio (Unified)", "apple", 0, 27.2),
     "M3 Max": (128, "MacBook Pro (Unified)", "apple", 0, 14.2),
     "M4 Max": (128, "MacBook Pro (Unified)", "apple", 0, 18.0),
     "RTX A6000": (48, "Workstation", "workstation", 0, 38.7),
     "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud", 1.00, 91.6),
     "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud", 1.00, 31.2),
     "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud", 0.70, 30.3),
+    "A100 40GB": (40, "AWS p4d (~$3/hr)", "cloud", 3.00, 77.9),
+    "A100 80GB": (80, "AWS p4de (~$5/hr)", "cloud", 5.00, 77.9),
+    "H100 80GB": (80, "AWS p5 (~$8/hr)", "cloud", 8.00, 267.6),
+    "H200 141GB": (141, "Coming soon (~$12/hr)", "cloud", 12.00, 296.0),
     "MI300X": (192, "AMD Cloud Instances", "amd", 6.00, 383.0),
 }
 DTYPE_BYTES = {
     "F32": 4, "float32": 4,
     "F16": 2, "float16": 2,
     "I64": 8, "int64": 8,
 }
 SERVING_FRAMEWORKS = {
     "None (raw PyTorch)": 1.20,
     "vLLM": 1.10,
+    "TGI": 1.15,
     "llama.cpp": 1.05,
+    "Transformers": 1.25,
     "Ollama": 1.08,
 }
 QUANTIZATION_METHODS = {
+    "FP16/BF16": {"bytes": 2.0, "quality": "100%", "desc": "Full precision"},
+    "INT8": {"bytes": 1.0, "quality": "~99%", "desc": "Good balance"},
+    "AWQ 4-bit": {"bytes": 0.5, "quality": "~97%", "desc": "Activation-aware"},
+    "GPTQ 4-bit": {"bytes": 0.5, "quality": "~95%", "desc": "GPU optimized"},
+    "GGUF Q8_0": {"bytes": 1.0, "quality": "~99%", "desc": "llama.cpp"},
+    "GGUF Q4_K_M": {"bytes": 0.5, "quality": "~95%", "desc": "llama.cpp"},
+    "GGUF Q2_K": {"bytes": 0.3125, "quality": "~85%", "desc": "Aggressive"},
 }
+def bytes_to_gb(b):
     return b / (1024 ** 3)
 @lru_cache(maxsize=50)
+def get_model_info_cached(model_id):
     try:
+        return api.model_info(model_id, files_metadata=True)
     except Exception as e:
+        return None
 @lru_cache(maxsize=50)
+def get_config_cached(model_id):
     try:
         config_path = hf_hub_download(model_id, "config.json")
         with open(config_path) as f:
             return f.read()
+    except Exception:
+        return "{}"
+def get_model_info(model_id):
     result = get_model_info_cached(model_id)
+    if result is None:
+        raise gr.Error(f"Could not fetch model info for {model_id}")
     return result
+def get_config(model_id):
     config_str = get_config_cached(model_id)
     return json.loads(config_str)
+def estimate_params_from_safetensors(info):
     if hasattr(info, 'safetensors') and info.safetensors:
         param_count = info.safetensors.total
         params_by_dtype = info.safetensors.parameters
     return 0, "F16"
+def get_head_dim(config):
     if "head_dim" in config:
         return config["head_dim"]
     hidden_size = config.get("hidden_size", config.get("n_embd", 0))
     return 128
+def estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes):
     return 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
+def estimate_training_memory(param_count, dtype_bytes, optimizer):
+    weights = param_count * dtype_bytes
+    gradients = param_count * dtype_bytes
     if optimizer == "AdamW":
+        opt_bytes = param_count * 4 * 2
     elif optimizer == "SGD":
+        opt_bytes = 0
     elif optimizer == "SGD + Momentum":
+        opt_bytes = param_count * 4
     else:
+        opt_bytes = param_count * 2
+    return {"weights": weights, "gradients": gradients, "optimizer": opt_bytes,
+            "total": weights + gradients + opt_bytes}
+def estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora):
     if use_qlora:
+        base = param_count * 0.5
     else:
+        base = param_count * dtype_bytes
+    lora_params = int(param_count * lora_rank * 0.0001)
+    lora_weights = lora_params * dtype_bytes
+    gradients = lora_params * dtype_bytes
+    optimizer = lora_params * 8
+    activations = base * 0.5
+    return {"base": base, "lora": lora_weights, "lora_params": lora_params,
+            "gradients": gradients, "optimizer": optimizer, "activations": activations,
+            "total": base + lora_weights + gradients + optimizer + activations}
+def estimate_throughput(param_count, gpu_tflops, batch_size):
     flops_per_token = 2 * param_count
+    peak = (gpu_tflops * 1e12) / flops_per_token
+    memory_bound = (1e12) / (param_count * 2)
+    effective = min(peak, memory_bound) * batch_size * 0.4
+    return effective
+def calculate_cost_estimate(vram_required):
+    estimates = []
+    for gpu, (vram, instance, cat, hourly, tflops) in GPU_SPECS.items():
+        if vram >= vram_required and hourly > 0:
+            estimates.append({"gpu": gpu, "hourly": hourly, "daily": hourly * 8, "monthly": hourly * 176})
+    return sorted(estimates, key=lambda x: x["hourly"])[:5]
+def search_hf_models(query):
     if not query or len(query) < 2:
+        return gr.update(choices=[], value=None)
     try:
+        models = list(list_models(search=query, sort="downloads", direction=-1, limit=10))
+        model_ids = [m.id for m in models if hasattr(m, 'id')]
+        if model_ids:
+            return gr.update(choices=model_ids, value=model_ids[0])
+        return gr.update(choices=["No models found"], value=None)
     except Exception:
+        return gr.update(choices=["Search failed"], value=None)
+def select_searched_model(selected):
+    if selected and selected not in ["No models found", "Search failed"]:
+        return selected
+    return ""
+def run_calculation(model_id, context_length, batch_size, mode, optimizer, framework,
+                    num_gpus, parallelism, flash_attn, lora_rank, show_throughput, show_cost):
+    model_id = model_id.strip() if model_id else ""
     if not model_id:
+        return "Please enter a model ID", None
     if "/" not in model_id:
+        return "Model ID should be in format 'organization/model-name'", None
+    try:
+        info = get_model_info(model_id)
+        config = get_config(model_id)
+    except Exception as e:
+        return f"Error fetching model: {str(e)}", None
     param_count, dominant_dtype = estimate_params_from_safetensors(info)
     if param_count == 0:
+        return "Could not determine parameters. Model may use pytorch_model.bin format.", None
     dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
     params_b = param_count / 1e9
+    weights_gb = bytes_to_gb(param_count * dtype_bytes)
+    num_layers = config.get("num_hidden_layers", config.get("n_layer", 32))
+    num_kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", 32))
     head_dim = get_head_dim(config)
+    kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes)
+    kv_gb = bytes_to_gb(kv_bytes)
+    results = []
+    results.append(f"## [{model_id}](https://huggingface.co/{model_id})")
+    results.append(f"**Parameters:** {params_b:.2f}B | **Dtype:** {dominant_dtype}")
+    results.append(f"\n### Memory Breakdown")
     if mode == "Training (Full)":
+        train = estimate_training_memory(param_count, dtype_bytes, optimizer)
         activation_gb = weights_gb * 2 * batch_size
+        total_gb = bytes_to_gb(train["total"]) + kv_gb + activation_gb
         results.append(f"- Weights: {weights_gb:.2f} GB")
+        results.append(f"- Gradients: {bytes_to_gb(train['gradients']):.2f} GB")
+        results.append(f"- Optimizer: {bytes_to_gb(train['optimizer']):.2f} GB")
+        results.append(f"- Activations: {activation_gb:.2f} GB")
+        chart_data = {"Weights": weights_gb, "Gradients": bytes_to_gb(train['gradients']),
+                     "Optimizer": bytes_to_gb(train['optimizer']), "Activations": activation_gb}
     elif mode == "LoRA Fine-tuning":
+        lora = estimate_lora_memory(param_count, dtype_bytes, lora_rank, False)
+        total_gb = bytes_to_gb(lora["total"])
+        results.append(f"- Base weights: {bytes_to_gb(lora['base']):.2f} GB")
+        results.append(f"- LoRA adapters: {bytes_to_gb(lora['lora']):.3f} GB")
+        results.append(f"- Activations: {bytes_to_gb(lora['activations']):.2f} GB")
+        chart_data = {"Base": bytes_to_gb(lora['base']), "LoRA": bytes_to_gb(lora['lora']),
+                     "Activations": bytes_to_gb(lora['activations'])}
     elif mode == "QLoRA Fine-tuning":
+        lora = estimate_lora_memory(param_count, dtype_bytes, lora_rank, True)
+        total_gb = bytes_to_gb(lora["total"])
+        results.append(f"- Base weights (4-bit): {bytes_to_gb(lora['base']):.2f} GB")
+        results.append(f"- LoRA adapters: {bytes_to_gb(lora['lora']):.3f} GB")
+        results.append(f"- Activations: {bytes_to_gb(lora['activations']):.2f} GB")
+        chart_data = {"Base (4-bit)": bytes_to_gb(lora['base']), "LoRA": bytes_to_gb(lora['lora']),
+                     "Activations": bytes_to_gb(lora['activations'])}
+    else:  # Inference
+        overhead_mult = SERVING_FRAMEWORKS.get(framework, 1.15)
+        overhead_gb = (weights_gb + kv_gb) * (overhead_mult - 1)
+        total_gb = weights_gb + kv_gb + overhead_gb
         results.append(f"- Weights: {weights_gb:.2f} GB")
         results.append(f"- KV Cache: {kv_gb:.2f} GB")
+        results.append(f"- Overhead: {overhead_gb:.2f} GB")
+        chart_data = {"Weights": weights_gb, "KV Cache": kv_gb, "Overhead": overhead_gb}
     if num_gpus > 1:
+        per_gpu = total_gb / num_gpus * 1.05
+        results.append(f"\n### Multi-GPU ({num_gpus}x)")
+        results.append(f"- Per GPU: {per_gpu:.2f} GB")
+        effective_vram = per_gpu
     else:
+        effective_vram = total_gb
+    results.append(f"\n### Total VRAM: **{total_gb:.2f} GB**")
     results.append(f"\n### GPU Recommendations")
+    results.append("| GPU | VRAM | Fits | Headroom |")
+    results.append("|-----|------|------|----------|")
+    for gpu, (vram, instance, cat, cost, tflops) in GPU_SPECS.items():
+        fits = "Yes" if vram >= effective_vram else "No"
+        headroom = vram - effective_vram
+        hr_str = f"+{headroom:.1f}" if headroom > 0 else f"{headroom:.1f}"
+        results.append(f"| {gpu} | {vram}GB | {fits} | {hr_str}GB |")
     if show_cost:
+        costs = calculate_cost_estimate(effective_vram)
+        if costs:
+            results.append(f"\n### Cloud Costs")
+            results.append("| GPU | Hourly | Monthly |")
+            results.append("|-----|--------|---------|")
+            for c in costs:
+                results.append(f"| {c['gpu']} | ${c['hourly']:.2f} | ${c['monthly']:.0f} |")
+    if effective_vram > 24:
+        results.append(f"\n### Quantization Options")
+        results.append("| Method | Size | Quality |")
+        results.append("|--------|------|---------|")
+        for method, specs in QUANTIZATION_METHODS.items():
+            size = bytes_to_gb(param_count * specs["bytes"]) * 1.1
+            fits = "Yes" if size <= 24 else "No"
+            results.append(f"| {method} | {size:.1f}GB | {specs['quality']} |")
+    df = pd.DataFrame({"Component": list(chart_data.keys()), "GB": list(chart_data.values())})
     return "\n".join(results), df
+def run_comparison(models_text, context_length):
+    if not models_text:
+        return "Enter model IDs, one per line"
+    model_ids = [m.strip() for m in models_text.strip().split("\n") if m.strip()]
     if len(model_ids) < 2:
+        return "Enter at least 2 model IDs"
     if len(model_ids) > 5:
+        return "Maximum 5 models"
+    results = ["## Model Comparison", f"*Context: {context_length:,}*\n"]
+    results.append("| Model | Params | Inference | Training | QLoRA |")
+    results.append("|-------|--------|-----------|----------|-------|")
+    data = []
     for model_id in model_ids:
         try:
             info = get_model_info(model_id)
             config = get_config(model_id)
+            param_count, dtype = estimate_params_from_safetensors(info)
             if param_count == 0:
+                results.append(f"| {model_id} | Error | - | - | - |")
                 continue
+            dtype_bytes = DTYPE_BYTES.get(dtype, 2)
             weights_gb = bytes_to_gb(param_count * dtype_bytes)
+            num_layers = config.get("num_hidden_layers", 32)
+            num_kv_heads = config.get("num_key_value_heads", 32)
             head_dim = get_head_dim(config)
+            kv_gb = bytes_to_gb(estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes))
+            inference_gb = weights_gb + kv_gb
+            train = estimate_training_memory(param_count, dtype_bytes, "AdamW")
+            training_gb = bytes_to_gb(train["total"]) + weights_gb * 2
+            qlora = estimate_lora_memory(param_count, dtype_bytes, 16, True)
+            qlora_gb = bytes_to_gb(qlora["total"])
+            name = model_id.split("/")[-1]
+            results.append(f"| {name} | {param_count/1e9:.1f}B | {inference_gb:.1f}GB | {training_gb:.1f}GB | {qlora_gb:.1f}GB |")
+            data.append({"name": name, "inference": inference_gb, "training": training_gb, "qlora": qlora_gb})
         except Exception as e:
+            results.append(f"| {model_id} | Error | - | - | - |")
+    if len(data) >= 2:
+        results.append("\n### Best Options")
+        best_inf = min(data, key=lambda x: x["inference"])
+        best_train = min(data, key=lambda x: x["training"])
+        best_qlora = min(data, key=lambda x: x["qlora"])
+        results.append(f"- Inference: {best_inf['name']} ({best_inf['inference']:.1f}GB)")
+        results.append(f"- Training: {best_train['name']} ({best_train['training']:.1f}GB)")
+        results.append(f"- QLoRA: {best_qlora['name']} ({best_qlora['qlora']:.1f}GB)")
     return "\n".join(results)
+def run_export(text, fmt):
+    if not text:
+        return "No results to export"
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    if fmt == "JSON":
+        return json.dumps({"timestamp": timestamp, "content": text}, indent=2)
+    return f"Export - {timestamp}\n{'='*40}\n\n{text.replace('**', '').replace('###', '---')}"
+# Build the interface
+with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# VRAM & Instance Type Calculator")
+    gr.Markdown("Estimate GPU memory for HuggingFace models")
+    with gr.Tabs() as tabs:
+        with gr.TabItem("Calculator"):
+            with gr.Row():
+                model_input = gr.Textbox(label="Model ID", placeholder="meta-llama/Llama-3.1-8B", scale=2)
+                search_input = gr.Textbox(label="Search", placeholder="llama 8b", scale=1)
+            with gr.Row():
+                search_btn = gr.Button("Search HuggingFace")
+                search_dropdown = gr.Dropdown(label="Results", choices=[], interactive=True)
+            search_btn.click(fn=search_hf_models, inputs=[search_input], outputs=[search_dropdown])
+            search_dropdown.change(fn=select_searched_model, inputs=[search_dropdown], outputs=[model_input])
+            with gr.Row():
+                mode_input = gr.Radio(
+                    choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
+                    value="Inference", label="Mode"
+                )
+                context_input = gr.Slider(minimum=512, maximum=131072, value=4096, step=512, label="Context Length")
+                batch_input = gr.Slider(minimum=1, maximum=64, value=1, step=1, label="Batch Size")
+            with gr.Accordion("Advanced Options", open=False):
+                with gr.Row():
+                    serving_input = gr.Dropdown(choices=list(SERVING_FRAMEWORKS.keys()), value="vLLM", label="Framework")
+                    optimizer_input = gr.Dropdown(choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"], value="AdamW", label="Optimizer")
+                    lora_rank_input = gr.Slider(minimum=4, maximum=128, value=16, step=4, label="LoRA Rank")
+                with gr.Row():
+                    num_gpus_input = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="GPUs")
+                    parallelism_input = gr.Dropdown(choices=["Tensor", "Pipeline", "Data"], value="Tensor", label="Parallelism")
+                    flash_input = gr.Checkbox(value=True, label="Flash Attention")
+                with gr.Row():
+                    throughput_input = gr.Checkbox(value=True, label="Show Throughput")
+                    cost_input = gr.Checkbox(value=True, label="Show Costs")
+            calc_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
+            with gr.Row():
+                output_md = gr.Markdown()
+                output_chart = gr.BarPlot(x="Component", y="GB", title="Memory Breakdown", height=350, width=400)
+            calc_btn.click(
+                fn=run_calculation,
+                inputs=[model_input, context_input, batch_input, mode_input, optimizer_input,
+                       serving_input, num_gpus_input, parallelism_input, flash_input,
+                       lora_rank_input, throughput_input, cost_input],
+                outputs=[output_md, output_chart]
             )
+            gr.Markdown("### Popular Models")
+            examples = gr.Examples(
+                examples=[
+                    ["meta-llama/Llama-3.1-8B", 4096, 1],
+                    ["meta-llama/Llama-3.1-70B", 8192, 1],
+                    ["mistralai/Mistral-7B-v0.1", 4096, 1],
+                    ["Qwen/Qwen2.5-7B", 8192, 1],
+                    ["google/gemma-2-9b", 8192, 1],
+                ],
+                inputs=[model_input, context_input, batch_input],
             )
+        with gr.TabItem("Compare"):
+            gr.Markdown("Compare multiple models (one per line, 2-5 models)")
+            compare_input = gr.Textbox(
+                label="Model IDs",
+                placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1",
+                lines=5
             )
+            compare_ctx = gr.Slider(minimum=512, maximum=131072, value=4096, step=512, label="Context")
+            compare_btn = gr.Button("Compare", variant="primary")
+            compare_output = gr.Markdown()
+            compare_btn.click(fn=run_comparison, inputs=[compare_input, compare_ctx], outputs=[compare_output])
+            gr.Examples(
+                examples=[
+                    ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1", 4096],
+                ],
+                inputs=[compare_input, compare_ctx],
             )
+        with gr.TabItem("Export"):
+            gr.Markdown("Export results to JSON or text")
+            export_input = gr.Textbox(label="Paste Results", lines=10, placeholder="Paste results here...")
+            export_fmt = gr.Radio(choices=["JSON", "Text"], value="JSON", label="Format")
+            export_btn = gr.Button("Export", variant="primary")
+            export_output = gr.Textbox(label="Output", lines=15, show_copy_button=True)
+            export_btn.click(fn=run_export, inputs=[export_input, export_fmt], outputs=[export_output])
+    gr.Markdown("---\n*Estimates are approximate. Built with Gradio & HuggingFace Hub.*")
 if __name__ == "__main__":
     demo.launch()