Spaces:

visualisable-ai
/

api

Running on CPU Upgrade

gary-boon Claude Opus 4.5 committed 7 days ago

Commit

2860768

1 Parent(s): 76020ee

Add system prompt support for instruction-tuned models

- Add prompt_style and system_prompt fields to ModelConfig
- Create prompt_formatter.py service for unified prompt handling
- Update research endpoint to use formatter with proper system prompts
- Devstral now receives proper system + user message format

This fixes the garbage token output from Devstral by properly
formatting prompts for instruction-tuned models.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>

Files changed (3) hide show

backend/model_config.py +11 -3
backend/model_service.py +22 -28
backend/prompt_formatter.py +128 -0

backend/model_config.py CHANGED Viewed

@@ -24,6 +24,8 @@ class ModelConfig(TypedDict):
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
     uses_chat_template: bool  # Whether model expects instruction format
 # Supported models registry
@@ -43,7 +45,9 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
         "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
-        "uses_chat_template": False  # Base model, raw completion
     },
     "code-llama-7b": {
         "hf_path": "codellama/CodeLlama-7b-hf",
@@ -60,7 +64,9 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "min_vram_gb": 14.0,   # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,    # FP16 requires ~18GB RAM for CPU fallback
         "recommended_dtype": "fp16",
-        "uses_chat_template": False  # Base model, raw completion
     },
     "devstral-small": {
         "hf_path": "mistralai/Devstral-Small-2507",
@@ -77,7 +83,9 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "min_vram_gb": 48.0,   # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,    # BF16 requires ~96GB RAM for CPU fallback
         "recommended_dtype": "bf16",  # Devstral requires bfloat16
-        "uses_chat_template": True  # Instruction-tuned, requires chat format
     }
 }

     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
     uses_chat_template: bool  # Whether model expects instruction format
+    prompt_style: str  # "completion" | "instruction" - how to format prompts
+    system_prompt: Optional[str]  # Default system prompt for instruction models
 # Supported models registry
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
         "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
+        "uses_chat_template": False,  # Base model, raw completion
+        "prompt_style": "completion",  # Raw text continuation
+        "system_prompt": None  # Base models don't use system prompts
     },
     "code-llama-7b": {
         "hf_path": "codellama/CodeLlama-7b-hf",
         "min_vram_gb": 14.0,   # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,    # FP16 requires ~18GB RAM for CPU fallback
         "recommended_dtype": "fp16",
+        "uses_chat_template": False,  # Base model, raw completion
+        "prompt_style": "completion",  # Raw text continuation
+        "system_prompt": None  # Base models don't use system prompts
     },
     "devstral-small": {
         "hf_path": "mistralai/Devstral-Small-2507",
         "min_vram_gb": 48.0,   # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,    # BF16 requires ~96GB RAM for CPU fallback
         "recommended_dtype": "bf16",  # Devstral requires bfloat16
+        "uses_chat_template": True,  # Instruction-tuned, requires chat format
+        "prompt_style": "instruction",  # Requires system + user messages
+        "system_prompt": "You are an expert Python programmer. Continue the code provided by the user. Output only valid Python code, no explanations or markdown."
     }
 }

backend/model_service.py CHANGED Viewed

@@ -1487,38 +1487,32 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
         logger.info(f"Research attention analysis: prompt_len={len(prompt)}, max_tokens={max_tokens}")
-        # Check if model uses chat template (instruct models like Devstral)
         from .model_config import get_model_config
         model_config = get_model_config(manager.model_id)
-        uses_chat_template = model_config.get("uses_chat_template", False) if model_config else False
-        # Format prompt for chat/instruct models
-        if uses_chat_template:
-            # Check if tokenizer has a chat template actually configured (not just the method)
-            has_template = (
-                hasattr(manager.tokenizer, 'chat_template') and
-                manager.tokenizer.chat_template is not None
-            )
-            if has_template:
-                # Use tokenizer's built-in chat template
-                messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
-                formatted_prompt = manager.tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-                logger.info(f"Applied chat template for {manager.model_id}")
-            else:
-                # Fallback: Manual Mistral-style instruction format
-                # Keep it simple - no newlines (they become special tokens)
-                formatted_prompt = f"<s>[INST] Continue this Python code: {prompt} [/INST]"
-                logger.info(f"Applied manual instruction format for {manager.model_id}")
-            # Use temperature=0 for instruct models (fully deterministic code)
             temperature = 0.0
             logger.info(f"Using temperature={temperature} for deterministic instruct model output")
-        else:
-            # Base model - use raw prompt
-            formatted_prompt = prompt
         # Tokenize and prepare
         inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)

         logger.info(f"Research attention analysis: prompt_len={len(prompt)}, max_tokens={max_tokens}")
+        # Get model config for prompt formatting
         from .model_config import get_model_config
+        from .prompt_formatter import format_prompt
         model_config = get_model_config(manager.model_id)
+        # Get optional system prompt override from request
+        system_prompt_override = request.get("system_prompt")
+        # Format prompt using the unified formatter
+        formatted_prompt = format_prompt(
+            prompt=prompt,
+            model_config=model_config or {},
+            tokenizer=manager.tokenizer,
+            system_prompt_override=system_prompt_override
+        )
+        # Log formatting details
+        prompt_style = model_config.get("prompt_style", "completion") if model_config else "completion"
+        logger.info(f"Formatted prompt for {manager.model_id} using style={prompt_style}")
+        if prompt_style == "instruction":
+            logger.info(f"Formatted prompt preview: {formatted_prompt[:200]}...")
+        # Use temperature=0 for instruct models (fully deterministic code)
+        if prompt_style == "instruction":
             temperature = 0.0
             logger.info(f"Using temperature={temperature} for deterministic instruct model output")
         # Tokenize and prepare
         inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)

backend/prompt_formatter.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""
+Prompt Formatter Service
+Handles formatting prompts appropriately for different model types:
+- Completion models: Raw text continuation
+- Instruction models: System prompt + user message with chat template
+"""
+from typing import Dict, Optional, Any
class PromptFormatter:
    """Format prompts according to a model's configured prompt style.

    Completion models (CodeGen, Code Llama base):
        - The prompt is returned unchanged so the model simply continues it.

    Instruction models (Devstral, instruct variants):
        - The prompt is wrapped as a system message plus a user message.
        - The tokenizer's ``chat_template`` is used when one is configured.
        - Otherwise (or if the template raises) a manual Mistral-style
          ``[INST]`` string is produced.
    """

    # Used when neither the caller nor the model config supplies a system prompt.
    _FALLBACK_SYSTEM_PROMPT = (
        "You are a helpful coding assistant. Continue the code provided."
    )

    def format(
        self,
        prompt: str,
        model_config: Dict[str, Any],
        tokenizer: Any,
        system_prompt_override: Optional[str] = None
    ) -> str:
        """Format a prompt appropriately for the model type.

        Args:
            prompt: The user's input (e.g., "def quicksort(arr):").
            model_config: Model configuration mapping. Only the
                ``prompt_style`` and ``system_prompt`` keys are read here.
                ``None`` or an empty mapping is treated as a completion model.
            tokenizer: Tokenizer object; only its ``chat_template`` attribute
                and ``apply_chat_template`` method are used, and only for
                instruction-style models.
            system_prompt_override: Optional replacement for the configured
                system prompt. A falsy value (``None`` or ``""``) means
                "no override".

        Returns:
            The formatted prompt string, ready for tokenization.
        """
        # Tolerate a missing config so callers do not need `config or {}`.
        config = model_config or {}

        # Anything other than "instruction" is treated as raw completion.
        if config.get("prompt_style", "completion") != "instruction":
            return prompt

        return self._format_instruction(
            prompt,
            config,
            tokenizer,
            system_prompt_override
        )

    def _format_instruction(
        self,
        prompt: str,
        model_config: Dict[str, Any],
        tokenizer: Any,
        system_prompt_override: Optional[str] = None
    ) -> str:
        """Build a system + user prompt for an instruction-tuned model.

        The system prompt is chosen by precedence: explicit override, then
        the model config's ``system_prompt``, then a generic fallback.

        The tokenizer's chat template is tried first. If the tokenizer has
        no template configured, or applying it raises, the manual Mistral
        format is returned instead.
        """
        system_prompt = (
            system_prompt_override
            or (model_config or {}).get("system_prompt")
            or self._FALLBACK_SYSTEM_PROMPT
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]

        # A tokenizer may expose the attribute yet have no template set, so
        # check the value rather than mere presence of the attribute.
        if getattr(tokenizer, "chat_template", None) is not None:
            try:
                return tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )
            except Exception as exc:
                # Deliberately best-effort: any template failure falls through
                # to the manual format, but is logged rather than printed.
                import logging

                logging.getLogger(__name__).warning(
                    "chat_template failed, using manual format: %s", exc
                )

        return self._manual_mistral_format(prompt, system_prompt)

    def _manual_mistral_format(self, prompt: str, system_prompt: str) -> str:
        """Return the manual Mistral instruction string used as a fallback.

        Format: ``<s>[INST] {system}\\n\\n{user} [/INST]``
        """
        # NOTE(review): the literal "<s>" may duplicate the BOS token if the
        # caller tokenizes this string with special tokens enabled - confirm.
        return f"<s>[INST] {system_prompt}\n\n{prompt} [/INST]"
# Shared module-level formatter so callers need not construct their own.
_formatter = PromptFormatter()


def format_prompt(
    prompt: str,
    model_config: Dict[str, Any],
    tokenizer: Any,
    system_prompt_override: Optional[str] = None
) -> str:
    """Format a prompt via the shared module-level ``PromptFormatter``.

    Args:
        prompt: The user's input (e.g., "def quicksort(arr):").
        model_config: Model configuration mapping for the active model.
        tokenizer: Tokenizer object for the active model.
        system_prompt_override: Optional replacement for the default
            system prompt.

    Returns:
        The formatted prompt string, ready for tokenization.
    """
    formatted = _formatter.format(
        prompt=prompt,
        model_config=model_config,
        tokenizer=tokenizer,
        system_prompt_override=system_prompt_override,
    )
    return formatted