gary-boon (with Claude Opus 4.5) committed on
Commit 5333b21 · 1 Parent(s): ba27c0c

fix: Use eager attention for output_attentions support


Devstral defaults to SDPA (PyTorch's scaled dot-product attention), which doesn't
support output_attentions=True. Force eager attention so attention weights can be
captured for visualization.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>

Files changed (1)
  1. backend/model_service.py +3 -1
backend/model_service.py CHANGED
@@ -189,11 +189,13 @@ class ModelManager:
         logger.info(f" Max context: {self.max_context}, Batch size: {self.batch_size}")
 
         # Load model with configured dtype
+        # Use eager attention to support output_attentions=True for visualization
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             torch_dtype=self.dtype,
             low_cpu_mem_usage=True,
-            trust_remote_code=True
+            trust_remote_code=True,
+            attn_implementation="eager"
         ).to(self.device)
 
         # Load tokenizer
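
For context, a minimal sketch of how this setting is exercised downstream; the checkpoint name, prompt, dtype, and device handling below are illustrative assumptions, not code from backend/model_service.py, which configures these through ModelManager.

```python
# Sketch only: assumes a Devstral checkpoint name; ModelManager supplies its own
# model_name, dtype, and device in the actual service.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Devstral-Small-2505"  # assumed checkpoint for illustration
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    attn_implementation="eager",  # needed so per-head attention weights are materialized
).to(device)

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

# outputs.attentions is a tuple with one tensor per layer, each shaped
# (batch, num_heads, seq_len, seq_len); these weights drive the visualization.
print(len(outputs.attentions), outputs.attentions[0].shape)
```

The SDPA and flash-attention kernels fuse the softmax and never expose the attention matrix, so output_attentions=True only works on the eager path; forcing it at load time avoids a per-call fallback or a runtime error.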