Livengood Claude committed on
Commit b502091 · 1 Parent(s): 3df3dc4

Add enhanced features: colored GPU tables, cloud costs, more examples


Features added:
- Color-coded status indicators (🟢🟡🔴) for GPU fit
- Separate sections for Consumer GPUs, Apple Silicon, Cloud GPUs
- Expanded cloud GPU options with hourly/daily/monthly costs
- Best value cloud recommendation
- GPU Reference tab with all hardware specs
- 12 popular model examples (Llama, Mistral, Qwen, Gemma, Phi, DeepSeek)
- Quick comparison sets for model families
- Improved memory breakdown tables
- Quantization options with fit indicators
- Soft theme for better readability

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
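The color-coded fit indicators and cloud-cost columns listed above reduce to a small amount of arithmetic inside `calculate()`. A minimal sketch of that logic, pulled out of the diff below; the 24 GB card and $1.00/hour rate in the usage lines are illustrative values, not figures hard-coded in the app:

```python
# Sketch of the fit/cost logic this commit adds (simplified from app.py).

def fit_status(vram_gb: float, required_gb: float) -> str:
    """Color-coded fit indicator used in the GPU tables."""
    headroom = vram_gb - required_gb
    if headroom >= 2:
        return "🟢 Good fit"
    elif headroom >= 0:
        return "🟡 Tight fit"
    return "🔴 Too small"

def cloud_cost(hourly_usd: float) -> tuple[float, float]:
    """Daily (8 h) and monthly (22 days x 8 h) cost columns."""
    return hourly_usd * 8, hourly_usd * 176

print(fit_status(24, 19.2))   # hypothetical 24 GB card, 19.2 GB needed -> 🟢 Good fit
print(cloud_cost(1.00))       # hypothetical $1.00/hr GPU -> (8.0, 176.0)
```

The monthly figure uses the same 22 working days × 8 hours assumption that the commit comments in `app.py`.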

Files changed (1)
app.py  +214 -74
app.py CHANGED
@@ -8,22 +8,39 @@ from functools import lru_cache
 
 api = HfApi()
 
-GPU_SPECS = {
-    "RTX 3080": (10, 0),
-    "RTX 3090": (24, 0),
-    "RTX 4080": (16, 0),
-    "RTX 4090": (24, 0),
-    "RTX 5090": (32, 0),
-    "M2 Ultra": (192, 0),
-    "M3 Max": (128, 0),
-    "M4 Max": (128, 0),
-    "RTX A6000": (48, 0),
-    "L40S": (48, 1.00),
-    "A10G": (24, 1.00),
+# Consumer GPUs (no hourly cost)
+CONSUMER_GPUS = {
+    "RTX 3080": 10,
+    "RTX 3080 Ti": 12,
+    "RTX 3090": 24,
+    "RTX 3090 Ti": 24,
+    "RTX 4080": 16,
+    "RTX 4080 Super": 16,
+    "RTX 4090": 24,
+    "RTX 5090": 32,
+}
+
+# Apple Silicon (no hourly cost)
+APPLE_GPUS = {
+    "M1 Max": 64,
+    "M2 Max": 96,
+    "M2 Ultra": 192,
+    "M3 Max": 128,
+    "M4 Max": 128,
+}
+
+# Cloud/Datacenter GPUs (with hourly costs from major providers)
+CLOUD_GPUS = {
+    "T4": (16, 0.35),
     "L4": (24, 0.70),
+    "A10G": (24, 1.00),
+    "RTX A5000": (24, 0.80),
+    "RTX A6000": (48, 1.50),
+    "L40S": (48, 1.20),
     "A100 40GB": (40, 3.00),
     "A100 80GB": (80, 5.00),
     "H100 80GB": (80, 8.00),
+    "H100 NVL": (94, 10.00),
 }
 
 DTYPE_BYTES = {
@@ -42,6 +59,15 @@ FRAMEWORKS = {
     "Ollama": 1.08,
 }
 
+CONTEXT_PRESETS = {
+    "2K (fast chat)": 2048,
+    "4K (standard)": 4096,
+    "8K (extended)": 8192,
+    "16K (long docs)": 16384,
+    "32K (very long)": 32768,
+    "128K (full context)": 131072,
+}
+
 
 def bytes_to_gb(b):
     return b / (1024 ** 3)
@@ -120,79 +146,135 @@ def calculate(model_id, context, batch, mode, framework, num_gpus, lora_rank):
             opt_gb = bytes_to_gb(params * 8)
             act_gb = weights_gb * 2 * batch
             total = weights_gb + grad_gb + opt_gb + act_gb
-            out.append("### Training Memory")
-            out.append("- Weights: " + str(round(weights_gb, 1)) + " GB")
-            out.append("- Gradients: " + str(round(grad_gb, 1)) + " GB")
-            out.append("- Optimizer: " + str(round(opt_gb, 1)) + " GB")
-            out.append("- Activations: " + str(round(act_gb, 1)) + " GB")
+            out.append("### Training Memory Breakdown")
+            out.append("| Component | Size |")
+            out.append("|-----------|------|")
+            out.append("| Weights | " + str(round(weights_gb, 1)) + " GB |")
+            out.append("| Gradients | " + str(round(grad_gb, 1)) + " GB |")
+            out.append("| Optimizer (AdamW) | " + str(round(opt_gb, 1)) + " GB |")
+            out.append("| Activations | " + str(round(act_gb, 1)) + " GB |")
         elif mode == "LoRA":
             base = weights_gb
             lora_params = int(params * lora_rank * 0.0001)
             lora_gb = bytes_to_gb(lora_params * dtype_bytes)
             act_gb = base * 0.3
             total = base + lora_gb + act_gb
-            out.append("### LoRA Memory")
-            out.append("- Base (frozen): " + str(round(base, 1)) + " GB")
-            out.append("- LoRA adapters: " + str(round(lora_gb, 2)) + " GB")
-            out.append("- Activations: " + str(round(act_gb, 1)) + " GB")
+            out.append("### LoRA Memory Breakdown")
+            out.append("| Component | Size |")
+            out.append("|-----------|------|")
+            out.append("| Base model (frozen) | " + str(round(base, 1)) + " GB |")
+            out.append("| LoRA adapters (rank " + str(lora_rank) + ") | " + str(round(lora_gb, 2)) + " GB |")
+            out.append("| Activations | " + str(round(act_gb, 1)) + " GB |")
         elif mode == "QLoRA":
             base = bytes_to_gb(params * 0.5)
             lora_params = int(params * lora_rank * 0.0001)
             lora_gb = bytes_to_gb(lora_params * dtype_bytes)
             act_gb = base * 0.3
             total = base + lora_gb + act_gb
-            out.append("### QLoRA Memory")
-            out.append("- Base (4-bit): " + str(round(base, 1)) + " GB")
-            out.append("- LoRA adapters: " + str(round(lora_gb, 2)) + " GB")
-            out.append("- Activations: " + str(round(act_gb, 1)) + " GB")
+            out.append("### QLoRA Memory Breakdown")
+            out.append("| Component | Size |")
+            out.append("|-----------|------|")
+            out.append("| Base model (4-bit) | " + str(round(base, 1)) + " GB |")
+            out.append("| LoRA adapters (rank " + str(lora_rank) + ") | " + str(round(lora_gb, 2)) + " GB |")
+            out.append("| Activations | " + str(round(act_gb, 1)) + " GB |")
         else:
             overhead = FRAMEWORKS.get(framework, 1.15)
             extra = (weights_gb + kv_gb) * (overhead - 1)
             total = weights_gb + kv_gb + extra
-            out.append("### Inference Memory")
-            out.append("- Weights: " + str(round(weights_gb, 1)) + " GB")
-            out.append("- KV Cache: " + str(round(kv_gb, 1)) + " GB")
-            out.append("- Overhead (" + framework + "): " + str(round(extra, 1)) + " GB")
+            out.append("### Inference Memory Breakdown")
+            out.append("| Component | Size |")
+            out.append("|-----------|------|")
+            out.append("| Model weights | " + str(round(weights_gb, 1)) + " GB |")
+            out.append("| KV Cache (" + str(context) + " ctx) | " + str(round(kv_gb, 1)) + " GB |")
+            out.append("| Framework overhead (" + framework + ") | " + str(round(extra, 1)) + " GB |")
 
         if num_gpus > 1:
             per_gpu = total / num_gpus * 1.05
             out.append("")
-            out.append("**Multi-GPU (" + str(num_gpus) + "x):** " + str(round(per_gpu, 1)) + " GB/GPU")
+            out.append("**Multi-GPU (" + str(num_gpus) + "x):** " + str(round(per_gpu, 1)) + " GB per GPU (includes 5% communication overhead)")
             effective = per_gpu
         else:
             effective = total
 
         out.append("")
-        out.append("## Total: " + str(round(total, 1)) + " GB")
+        out.append("## Total Required: " + str(round(total, 1)) + " GB")
 
+        # Consumer GPUs section with colors
         out.append("")
-        out.append("### GPU Options")
-        out.append("| GPU | VRAM | Fits | Headroom |")
-        out.append("|-----|------|------|----------|")
-        for gpu, (vram, cost) in GPU_SPECS.items():
-            fits = "Yes" if vram >= effective else "No"
+        out.append("### Consumer GPUs")
+        out.append("| GPU | VRAM | Status | Headroom |")
+        out.append("|-----|------|--------|----------|")
+        for gpu, vram in CONSUMER_GPUS.items():
             hr = vram - effective
+            if hr >= 2:
+                status = "🟢 Good fit"
+            elif hr >= 0:
+                status = "🟡 Tight fit"
+            else:
+                status = "🔴 Too small"
             sign = "+" if hr >= 0 else ""
-            out.append("| " + gpu + " | " + str(vram) + "GB | " + fits + " | " + sign + str(round(hr, 1)) + "GB |")
+            out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | " + sign + str(round(hr, 1)) + "GB |")
 
-        if effective > 24:
+        # Apple Silicon section
+        out.append("")
+        out.append("### Apple Silicon (Unified Memory)")
+        out.append("| Chip | Memory | Status | Headroom |")
+        out.append("|------|--------|--------|----------|")
+        for gpu, vram in APPLE_GPUS.items():
+            hr = vram - effective
+            if hr >= 10:
+                status = "🟢 Excellent"
+            elif hr >= 0:
+                status = "🟡 Usable"
+            else:
+                status = "🔴 Too small"
+            sign = "+" if hr >= 0 else ""
+            out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | " + sign + str(round(hr, 1)) + "GB |")
+
+        # Cloud GPUs section with costs
+        out.append("")
+        out.append("### Cloud GPU Options")
+        out.append("| GPU | VRAM | Status | $/hour | $/day (8hr) | $/month |")
+        out.append("|-----|------|--------|--------|-------------|---------|")
+
+        cloud_options = []
+        for gpu, (vram, cost) in CLOUD_GPUS.items():
+            hr = vram - effective
+            if hr >= 2:
+                status = "🟢 Good"
+            elif hr >= 0:
+                status = "🟡 Tight"
+            else:
+                status = "🔴 No"
+            daily = cost * 8
+            monthly = cost * 176  # 22 days * 8 hours
+            cloud_options.append((gpu, vram, hr, status, cost, daily, monthly))
+
+        # Sort by cost for those that fit
+        cloud_options.sort(key=lambda x: (x[2] < 0, x[4]))
+
+        for gpu, vram, hr, status, cost, daily, monthly in cloud_options:
+            sign = "+" if hr >= 0 else ""
+            out.append("| " + gpu + " | " + str(vram) + "GB | " + status + " | $" + str(round(cost, 2)) + " | $" + str(round(daily, 2)) + " | $" + str(int(monthly)) + " |")
+
+        # Best value recommendation
+        fitting_gpus = [(gpu, cost) for gpu, (vram, cost) in CLOUD_GPUS.items() if vram >= effective]
+        if fitting_gpus:
+            fitting_gpus.sort(key=lambda x: x[1])
+            best = fitting_gpus[0]
             out.append("")
-            out.append("### Quantization to fit 24GB")
-            out.append("| Method | Size |")
-            out.append("|--------|------|")
-            for name, mult in [("INT8", 1.0), ("4-bit", 0.5), ("3-bit", 0.375)]:
-                size = bytes_to_gb(params * mult) * 1.1
-                out.append("| " + name + " | " + str(round(size, 1)) + "GB |")
+            out.append("**Best value cloud option:** " + best[0] + " at $" + str(round(best[1], 2)) + "/hour")
 
-        costs = [(gpu, cost) for gpu, (vram, cost) in GPU_SPECS.items() if vram >= effective and cost > 0]
-        if costs:
-            costs.sort(key=lambda x: x[1])
+        # Quantization suggestions if model is large
+        if effective > 24:
             out.append("")
-            out.append("### Cloud Costs (8hr/day)")
-            out.append("| GPU | $/hr | $/month |")
-            out.append("|-----|------|---------|")
-            for gpu, cost in costs[:4]:
-                out.append("| " + gpu + " | $" + str(round(cost, 2)) + " | $" + str(int(cost * 176)) + " |")
+            out.append("### Quantization Options (to fit consumer GPUs)")
+            out.append("| Method | Estimated Size | Fits 24GB |")
+            out.append("|--------|----------------|-----------|")
+            for name, mult in [("INT8", 1.0), ("4-bit (GPTQ/AWQ)", 0.5), ("3-bit", 0.375), ("2-bit (extreme)", 0.25)]:
                size = bytes_to_gb(params * mult) * 1.1
+                fits = "🟢 Yes" if size <= 24 else "🔴 No"
+                out.append("| " + name + " | " + str(round(size, 1)) + "GB | " + fits + " |")
 
         return "\n".join(out)
     except Exception as e:
@@ -210,17 +292,18 @@ def compare(models_text, context):
             return "Need at least 2 models"
 
         out = []
-        out.append("## Comparison")
-        out.append("| Model | Params | Inference | Training | QLoRA |")
-        out.append("|-------|--------|-----------|----------|-------|")
+        out.append("## Model Comparison")
+        out.append("")
+        out.append("| Model | Params | Inference | Training | QLoRA | Fits 24GB |")
+        out.append("|-------|--------|-----------|----------|-------|-----------|")
 
-        for mid in models[:5]:
+        for mid in models[:8]:
             try:
                 info = fetch_model_info(mid)
                 config = fetch_config(mid)
                 params, dtype = get_params(info)
                 if params == 0:
-                    out.append("| " + mid + " | Error | - | - | - |")
+                    out.append("| " + mid + " | Error | - | - | - | - |")
                     continue
 
                 db = DTYPE_BYTES.get(dtype, 2)
@@ -234,10 +317,14 @@ def compare(models_text, context):
                 train = w * 4 + w * 2
                 qlora = bytes_to_gb(params * 0.5) * 1.5
 
-                name = mid.split("/")[-1][:20]
-                out.append("| " + name + " | " + str(round(params / 1e9, 1)) + "B | " + str(round(inf, 1)) + "GB | " + str(round(train, 1)) + "GB | " + str(round(qlora, 1)) + "GB |")
+                fits = "🟢 Yes" if inf <= 24 else "🔴 No"
+                name = mid.split("/")[-1][:25]
+                out.append("| " + name + " | " + str(round(params / 1e9, 1)) + "B | " + str(round(inf, 1)) + "GB | " + str(round(train, 1)) + "GB | " + str(round(qlora, 1)) + "GB | " + fits + " |")
             except Exception:
-                out.append("| " + mid + " | Error | - | - | - |")
+                out.append("| " + mid + " | Error | - | - | - | - |")
+
+        out.append("")
+        out.append("*Context length: " + str(context) + " tokens*")
 
         return "\n".join(out)
     except Exception as e:
@@ -245,16 +332,16 @@
 
 
 # Build the interface
-with gr.Blocks(title="VRAM Calculator") as demo:
+with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# VRAM Calculator for LLMs")
-    gr.Markdown("Estimate VRAM requirements for HuggingFace models")
+    gr.Markdown("Estimate VRAM requirements for HuggingFace models - inference, training, LoRA, and QLoRA")
 
     with gr.Tabs():
         with gr.TabItem("Calculator"):
            model_in = gr.Textbox(
                 label="Model ID",
                 placeholder="meta-llama/Llama-3.1-8B",
-                info="Enter a HuggingFace model ID"
+                info="Enter a HuggingFace model ID (e.g., organization/model-name)"
             )
 
             mode_in = gr.Radio(
@@ -269,38 +356,42 @@ with gr.Blocks(title="VRAM Calculator") as demo:
                 maximum=131072,
                 value=4096,
                 step=512,
-                label="Context Length"
+                label="Context Length",
+                info="Max tokens for KV cache"
             )
             batch_in = gr.Slider(
                 minimum=1,
                 maximum=64,
                 value=1,
                 step=1,
-                label="Batch Size"
+                label="Batch Size",
+                info="Concurrent sequences"
             )
 
             with gr.Accordion("Advanced Options", open=False):
                 framework_in = gr.Dropdown(
                     choices=list(FRAMEWORKS.keys()),
                     value="vLLM",
-                    label="Framework"
+                    label="Inference Framework"
                 )
                 gpus_in = gr.Slider(
                     minimum=1,
                     maximum=8,
                     value=1,
                     step=1,
-                    label="Number of GPUs"
+                    label="Number of GPUs",
+                    info="For tensor parallelism"
                 )
                 lora_in = gr.Slider(
                     minimum=4,
                     maximum=128,
                     value=16,
                     step=4,
-                    label="LoRA Rank"
+                    label="LoRA Rank",
+                    info="Higher = more parameters"
                 )
 
-            calc_btn = gr.Button("Calculate", variant="primary")
+            calc_btn = gr.Button("Calculate VRAM", variant="primary")
             output = gr.Markdown()
 
             calc_btn.click(
@@ -309,21 +400,32 @@ with gr.Blocks(title="VRAM Calculator") as demo:
                 outputs=output
             )
 
+            gr.Markdown("### Popular Models")
             gr.Examples(
                 examples=[
                     ["meta-llama/Llama-3.1-8B"],
                     ["meta-llama/Llama-3.1-70B"],
+                    ["meta-llama/Llama-3.2-1B"],
+                    ["meta-llama/Llama-3.2-3B"],
                     ["mistralai/Mistral-7B-v0.1"],
+                    ["mistralai/Mixtral-8x7B-v0.1"],
+                    ["Qwen/Qwen2.5-7B"],
+                    ["Qwen/Qwen2.5-72B"],
+                    ["google/gemma-2-9b"],
+                    ["google/gemma-2-27b"],
+                    ["microsoft/phi-3-mini-4k-instruct"],
+                    ["deepseek-ai/DeepSeek-V2-Lite"],
                 ],
                 inputs=[model_in],
-                label="Example Models"
+                label="Click to load"
             )
 
         with gr.TabItem("Compare Models"):
+            gr.Markdown("Compare VRAM requirements across multiple models")
             cmp_in = gr.Textbox(
                 label="Models (one per line)",
-                lines=4,
-                placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1"
+                lines=6,
+                placeholder="meta-llama/Llama-3.1-8B\nmeta-llama/Llama-3.1-70B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B"
             )
             cmp_ctx = gr.Slider(
                 minimum=512,
@@ -332,7 +434,7 @@ with gr.Blocks(title="VRAM Calculator") as demo:
                 step=512,
                 label="Context Length"
             )
-            cmp_btn = gr.Button("Compare", variant="primary")
+            cmp_btn = gr.Button("Compare Models", variant="primary")
             cmp_out = gr.Markdown()
 
             cmp_btn.click(
@@ -341,8 +443,46 @@ with gr.Blocks(title="VRAM Calculator") as demo:
                 outputs=cmp_out
             )
 
+            gr.Markdown("### Quick Comparison Sets")
+            gr.Examples(
+                examples=[
+                    ["meta-llama/Llama-3.1-8B\nmeta-llama/Llama-3.1-70B\nmeta-llama/Llama-3.2-3B"],
+                    ["mistralai/Mistral-7B-v0.1\nmistralai/Mixtral-8x7B-v0.1"],
+                    ["Qwen/Qwen2.5-7B\nQwen/Qwen2.5-14B\nQwen/Qwen2.5-72B"],
+                    ["google/gemma-2-2b\ngoogle/gemma-2-9b\ngoogle/gemma-2-27b"],
+                ],
+                inputs=[cmp_in],
+                label="Click to load comparison"
+            )
+
+        with gr.TabItem("GPU Reference"):
+            gr.Markdown("## GPU VRAM Reference")
+            gr.Markdown("### Consumer GPUs (NVIDIA GeForce)")
+            consumer_md = "| GPU | VRAM | Notes |\n|-----|------|-------|\n"
+            for gpu, vram in CONSUMER_GPUS.items():
+                consumer_md += "| " + gpu + " | " + str(vram) + "GB | Consumer |\n"
+            gr.Markdown(consumer_md)
+
+            gr.Markdown("### Apple Silicon")
+            apple_md = "| Chip | Unified Memory | Notes |\n|------|----------------|-------|\n"
+            for gpu, vram in APPLE_GPUS.items():
+                apple_md += "| " + gpu + " | " + str(vram) + "GB | Shared CPU/GPU |\n"
+            gr.Markdown(apple_md)
+
+            gr.Markdown("### Cloud/Datacenter GPUs")
+            cloud_md = "| GPU | VRAM | Typical $/hr | Best For |\n|-----|------|--------------|----------|\n"
+            for gpu, (vram, cost) in CLOUD_GPUS.items():
+                if vram <= 24:
+                    use = "7B models, fine-tuning"
+                elif vram <= 48:
+                    use = "13B-30B models"
+                else:
+                    use = "70B+ models, training"
+                cloud_md += "| " + gpu + " | " + str(vram) + "GB | $" + str(round(cost, 2)) + " | " + use + " |\n"
+            gr.Markdown(cloud_md)
+
     gr.Markdown("---")
-    gr.Markdown("*Estimates are approximate. Actual usage may vary.*")
+    gr.Markdown("*Estimates are approximate. Actual usage varies by implementation, batch size, and optimizations.*")
 
 if __name__ == "__main__":
     demo.launch()
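For reference, the quantization table that `calculate()` now emits sizes each option as `bytes_to_gb(params * mult) * 1.1`, i.e. bytes per parameter times a 10% overhead factor. A quick worked example of that formula for a hypothetical 70B-parameter model (the parameter count here is an assumed input, not a value from the app):

```python
# Worked example of the quantization estimate added in this commit.
def bytes_to_gb(b):
    return b / (1024 ** 3)

params = 70e9  # assumed 70B-parameter model
for name, mult in [("INT8", 1.0), ("4-bit (GPTQ/AWQ)", 0.5), ("3-bit", 0.375), ("2-bit (extreme)", 0.25)]:
    size = bytes_to_gb(params * mult) * 1.1  # bytes per weight * 10% overhead
    print(f"{name}: {size:.1f} GB -> fits 24GB: {size <= 24}")

# INT8: 71.7 GB -> False
# 4-bit (GPTQ/AWQ): 35.9 GB -> False
# 3-bit: 26.9 GB -> False
# 2-bit (extreme): 17.9 GB -> True
```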