gary-boon (Claude Opus 4.5) committed
Commit 2860768 · 1 Parent(s): 76020ee

Add system prompt support for instruction-tuned models

- Add prompt_style and system_prompt fields to ModelConfig
- Create prompt_formatter.py service for unified prompt handling
- Update research endpoint to use formatter with proper system prompts
- Devstral now receives proper system + user message format

This fixes the garbage token output from Devstral by properly
formatting prompts for instruction-tuned models.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
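
For context, a minimal usage sketch of the new formatter (not part of the commit). The config dict mirrors the `devstral-small` entry in the diff below; `tokenizer=None` is passed deliberately so the example exercises the manual `[INST]` fallback rather than a real HuggingFace chat template, and it assumes `backend/` is importable as a package.

```python
# Usage sketch only -- not part of this commit. Passing tokenizer=None means
# there is no chat_template, so format_prompt() falls back to the manual
# <s>[INST] ... [/INST] format.
from backend.prompt_formatter import format_prompt

devstral_config = {
    "prompt_style": "instruction",
    "system_prompt": (
        "You are an expert Python programmer. Continue the code provided by the user. "
        "Output only valid Python code, no explanations or markdown."
    ),
}

formatted = format_prompt(
    prompt="def quicksort(arr):",
    model_config=devstral_config,
    tokenizer=None,
    system_prompt_override=None,
)
print(formatted)
# <s>[INST] You are an expert Python programmer. ...\n\ndef quicksort(arr): [/INST]
```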

backend/model_config.py CHANGED
@@ -24,6 +24,8 @@ class ModelConfig(TypedDict):
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
     uses_chat_template: bool  # Whether model expects instruction format
+    prompt_style: str  # "completion" | "instruction" - how to format prompts
+    system_prompt: Optional[str]  # Default system prompt for instruction models
 
 
 # Supported models registry
@@ -43,7 +45,9 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
         "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
-        "uses_chat_template": False  # Base model, raw completion
+        "uses_chat_template": False,  # Base model, raw completion
+        "prompt_style": "completion",  # Raw text continuation
+        "system_prompt": None  # Base models don't use system prompts
     },
     "code-llama-7b": {
         "hf_path": "codellama/CodeLlama-7b-hf",
@@ -60,7 +64,9 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
         "recommended_dtype": "fp16",
-        "uses_chat_template": False  # Base model, raw completion
+        "uses_chat_template": False,  # Base model, raw completion
+        "prompt_style": "completion",  # Raw text continuation
+        "system_prompt": None  # Base models don't use system prompts
     },
     "devstral-small": {
         "hf_path": "mistralai/Devstral-Small-2507",
@@ -77,7 +83,9 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
         "recommended_dtype": "bf16",  # Devstral requires bfloat16
-        "uses_chat_template": True  # Instruction-tuned, requires chat format
+        "uses_chat_template": True,  # Instruction-tuned, requires chat format
+        "prompt_style": "instruction",  # Requires system + user messages
+        "system_prompt": "You are an expert Python programmer. Continue the code provided by the user. Output only valid Python code, no explanations or markdown."
     }
 }
 
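
For anyone extending the registry later, a hedged sketch of how an additional instruction-tuned entry would use the two new fields. The model id, `hf_path`, and resource figures are invented for illustration; only the last three keys relate to this commit.

```python
# Hypothetical entry -- "example-instruct-7b", its hf_path, and the VRAM/RAM
# numbers are illustrative, not part of this commit or the real registry.
from backend.model_config import SUPPORTED_MODELS

SUPPORTED_MODELS["example-instruct-7b"] = {
    "hf_path": "example-org/example-instruct-7b",
    "min_vram_gb": 16.0,
    "min_ram_gb": 24.0,
    "recommended_dtype": "bf16",
    "uses_chat_template": True,      # Instruction-tuned, requires chat format
    "prompt_style": "instruction",   # Formatter builds system + user messages
    "system_prompt": "You are an expert Python programmer. Output only valid Python code.",
}
```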
 
backend/model_service.py CHANGED
@@ -1487,38 +1487,32 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
 
     logger.info(f"Research attention analysis: prompt_len={len(prompt)}, max_tokens={max_tokens}")
 
-    # Check if model uses chat template (instruct models like Devstral)
+    # Get model config for prompt formatting
     from .model_config import get_model_config
+    from .prompt_formatter import format_prompt
     model_config = get_model_config(manager.model_id)
-    uses_chat_template = model_config.get("uses_chat_template", False) if model_config else False
-
-    # Format prompt for chat/instruct models
-    if uses_chat_template:
-        # Check if tokenizer has a chat template actually configured (not just the method)
-        has_template = (
-            hasattr(manager.tokenizer, 'chat_template') and
-            manager.tokenizer.chat_template is not None
-        )
-        if has_template:
-            # Use tokenizer's built-in chat template
-            messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
-            formatted_prompt = manager.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
-            logger.info(f"Applied chat template for {manager.model_id}")
-        else:
-            # Fallback: Manual Mistral-style instruction format
-            # Keep it simple - no newlines (they become special tokens)
-            formatted_prompt = f"<s>[INST] Continue this Python code: {prompt} [/INST]"
-            logger.info(f"Applied manual instruction format for {manager.model_id}")
-        # Use temperature=0 for instruct models (fully deterministic code)
+
+    # Get optional system prompt override from request
+    system_prompt_override = request.get("system_prompt")
+
+    # Format prompt using the unified formatter
+    formatted_prompt = format_prompt(
+        prompt=prompt,
+        model_config=model_config or {},
+        tokenizer=manager.tokenizer,
+        system_prompt_override=system_prompt_override
+    )
+
+    # Log formatting details
+    prompt_style = model_config.get("prompt_style", "completion") if model_config else "completion"
+    logger.info(f"Formatted prompt for {manager.model_id} using style={prompt_style}")
+    if prompt_style == "instruction":
+        logger.info(f"Formatted prompt preview: {formatted_prompt[:200]}...")
+
+    # Use temperature=0 for instruct models (fully deterministic code)
+    if prompt_style == "instruction":
         temperature = 0.0
         logger.info(f"Using temperature={temperature} for deterministic instruct model output")
-    else:
-        # Base model - use raw prompt
-        formatted_prompt = prompt
 
     # Tokenize and prepare
     inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)
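
A hedged sketch of the request body a caller could send to exercise the new override, which the handler reads via `request.get("system_prompt")`. The `prompt` and `max_tokens` keys are inferred from the handler's logging; the endpoint's URL and any other required fields are outside this diff.

```python
# Hypothetical request dict for analyze_research_attention(). Only "system_prompt"
# is the new optional key (read via request.get("system_prompt")); "prompt" and
# "max_tokens" are inferred from the logging above, and any other fields the
# endpoint expects are not shown in this diff.
research_request = {
    "prompt": "def quicksort(arr):",
    "max_tokens": 128,
    # Optional: overrides the model's default system_prompt from model_config.py
    "system_prompt": "You are a terse Python expert. Return only code.",
}
```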
backend/prompt_formatter.py ADDED
@@ -0,0 +1,128 @@
+"""
+Prompt Formatter Service
+Handles formatting prompts appropriately for different model types:
+- Completion models: Raw text continuation
+- Instruction models: System prompt + user message with chat template
+"""
+
+from typing import Dict, Optional, Any
+
+
+class PromptFormatter:
+    """
+    Unified prompt formatting for different model types.
+
+    Completion models (CodeGen, Code Llama base):
+    - Pass prompt through unchanged
+    - Model treats it as text to continue
+
+    Instruction models (Devstral, instruct variants):
+    - Wrap with system prompt + user message
+    - Use tokenizer's chat_template if available
+    - Fallback to manual Mistral format
+    """
+
+    def format(
+        self,
+        prompt: str,
+        model_config: Dict[str, Any],
+        tokenizer: Any,
+        system_prompt_override: Optional[str] = None
+    ) -> str:
+        """
+        Format a prompt appropriately for the model type.
+
+        Args:
+            prompt: The user's input (e.g., "def quicksort(arr):")
+            model_config: Model configuration from model_config.py
+            tokenizer: HuggingFace tokenizer for the model
+            system_prompt_override: Optional override for the default system prompt
+
+        Returns:
+            Formatted prompt ready for tokenization
+        """
+        prompt_style = model_config.get("prompt_style", "completion")
+
+        if prompt_style == "instruction":
+            return self._format_instruction(
+                prompt,
+                model_config,
+                tokenizer,
+                system_prompt_override
+            )
+
+        # Completion style: return raw prompt
+        return prompt
+
+    def _format_instruction(
+        self,
+        prompt: str,
+        model_config: Dict[str, Any],
+        tokenizer: Any,
+        system_prompt_override: Optional[str] = None
+    ) -> str:
+        """
+        Format prompt for instruction-tuned models.
+
+        Uses the tokenizer's chat_template if available,
+        otherwise falls back to manual Mistral format.
+        """
+        # Get system prompt (override > model default > generic fallback)
+        system_prompt = system_prompt_override or model_config.get("system_prompt")
+        if not system_prompt:
+            system_prompt = "You are a helpful coding assistant. Continue the code provided."
+
+        # Build messages list
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": prompt}
+        ]
+
+        # Try tokenizer's native chat template first
+        if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template is not None:
+            try:
+                formatted = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+                return formatted
+            except Exception as e:
+                # Fall through to manual format if template fails
+                print(f"Warning: chat_template failed, using manual format: {e}")
+
+        # Fallback: Manual Mistral/Llama format
+        return self._manual_mistral_format(prompt, system_prompt)
+
+    def _manual_mistral_format(self, prompt: str, system_prompt: str) -> str:
+        """
+        Manual Mistral instruction format as fallback.
+
+        Format: <s>[INST] {system}\n\n{user} [/INST]
+        """
+        return f"<s>[INST] {system_prompt}\n\n{prompt} [/INST]"
+
+
+# Singleton instance for convenience
+_formatter = PromptFormatter()
+
+
+def format_prompt(
+    prompt: str,
+    model_config: Dict[str, Any],
+    tokenizer: Any,
+    system_prompt_override: Optional[str] = None
+) -> str:
+    """
+    Convenience function to format a prompt.
+
+    Args:
+        prompt: The user's input (e.g., "def quicksort(arr):")
+        model_config: Model configuration from model_config.py
+        tokenizer: HuggingFace tokenizer for the model
+        system_prompt_override: Optional override for the default system prompt
+
+    Returns:
+        Formatted prompt ready for tokenization
+    """
+    return _formatter.format(prompt, model_config, tokenizer, system_prompt_override)
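
A quick, hedged way to exercise both branches of `_format_instruction` without loading a model. `DummyTokenizer` is a stand-in whose rendering is invented for illustration; a real Devstral tokenizer's Jinja2 chat template would produce different output.

```python
# Sketch: exercise both formatting branches without loading a model.
# DummyTokenizer is a stand-in; its rendering is invented and will differ
# from a real HuggingFace tokenizer's chat_template.
from backend.prompt_formatter import format_prompt


class DummyTokenizer:
    chat_template = "dummy"  # non-None, so the chat-template branch is taken

    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True):
        rendered = "".join(f"<|{m['role']}|>{m['content']}" for m in messages)
        if add_generation_prompt:
            rendered += "<|assistant|>"
        return rendered


config = {"prompt_style": "instruction", "system_prompt": "Continue the user's Python code."}

# Branch 1: tokenizer exposes a chat_template -> apply_chat_template() is used.
print(format_prompt("def add(a, b):", config, DummyTokenizer()))

# Branch 2: no usable chat_template (None) -> manual <s>[INST] ... [/INST] fallback;
# an explicit override also takes precedence over config["system_prompt"].
print(format_prompt("def add(a, b):", config, None,
                    system_prompt_override="Return only a Python function body."))
```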