Instructions to use moondream/moondream3-preview with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use moondream/moondream3-preview with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="moondream/moondream3-preview", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("moondream/moondream3-preview", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use moondream/moondream3-preview with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "moondream/moondream3-preview"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "moondream/moondream3-preview",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/moondream/moondream3-preview
- SGLang
How to use moondream/moondream3-preview with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "moondream/moondream3-preview" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "moondream/moondream3-preview",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "moondream/moondream3-preview" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "moondream/moondream3-preview",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use moondream/moondream3-preview with Docker Model Runner:
docker model run hf.co/moondream/moondream3-preview
| import torch | |
| import torch.nn as nn | |
| from transformers import PreTrainedModel, PretrainedConfig | |
| from typing import Union | |
| from .config import MoondreamConfig | |
| from .moondream import MoondreamModel | |
| # Files sometimes don't get loaded without these... | |
| from .image_crops import * | |
| from .vision import * | |
| from .text import * | |
| from .region import * | |
| from .utils import * | |
def extract_question(text):
    """Pull the question out of a legacy image/question/answer prompt.

    The prompt must begin with the image-and-question header and end with
    the answer marker defined below. When both are present, the text
    between them is returned; otherwise ``None`` is returned.
    """
    head = "<image>\n\nQuestion: "
    tail = "\n\nAnswer:"

    # Guard clause: anything that is not wrapped in both markers is not a
    # legacy question prompt.
    if not (text.startswith(head) and text.endswith(tail)):
        return None

    return text[len(head) : -len(tail)]
class HfConfig(PretrainedConfig):
    """Configuration class for the ``moondream3`` model type."""

    model_type = "moondream3"
    _auto_class = "AutoConfig"

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the parent, then attach ``config``."""
        super().__init__(**kwargs)
        # Assigned after the parent initializer runs, so this value is what
        # ``self.config`` holds once construction finishes.
        supported_skills = ["query", "caption", "detect", "point"]
        self.config = {"skills": supported_skills}
class HfMoondream(PreTrainedModel):
    """``PreTrainedModel`` wrapper that exposes a ``MoondreamModel``.

    The wrapped model is built with ``setup_caches=False``; caches are set up
    lazily, the first time one of the skill accessors (``encode_image``,
    ``query``, ``caption``, ``detect``, ``point``, ``detect_gaze``) is read.
    """

    _auto_class = "AutoModelForCausalLM"
    config_class = HfConfig

    def __init__(self, config):
        """Build the wrapped ``MoondreamModel`` from ``config.config``."""
        super().__init__(config)
        self.model = MoondreamModel(
            MoondreamConfig.from_dict(config.config), setup_caches=False
        )
        self._is_kv_cache_setup = False
        self.post_init()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Load via the parent loader, then refresh the model's runtime buffers.

        All arguments are forwarded unchanged. The parent's return value is
        passed back as-is; when it is a tuple, its first element is taken to
        be the model.
        """
        output = super().from_pretrained(*args, **kwargs)
        model = output[0] if isinstance(output, tuple) else output
        model.model._refresh_runtime_buffers()
        return output

    def _setup_caches(self):
        """Run ``self.model._setup_caches()`` once; later calls are no-ops."""
        if not self._is_kv_cache_setup:
            self.model._setup_caches()
            self._is_kv_cache_setup = True

    # The accessors below are properties: callers in this class invoke them
    # with arguments (e.g. ``self.query(image, question)``), so each must
    # return the wrapped model's callable rather than take arguments itself.

    @property
    def encode_image(self):
        """The wrapped model's ``encode_image``, with caches set up first."""
        self._setup_caches()
        return self.model.encode_image

    @property
    def query(self):
        """The wrapped model's ``query``, with caches set up first."""
        self._setup_caches()
        return self.model.query

    @property
    def caption(self):
        """The wrapped model's ``caption``, with caches set up first."""
        self._setup_caches()
        return self.model.caption

    @property
    def detect(self):
        """The wrapped model's ``detect``, with caches set up first."""
        self._setup_caches()
        return self.model.detect

    @property
    def point(self):
        """The wrapped model's ``point``, with caches set up first."""
        self._setup_caches()
        return self.model.point

    @property
    def detect_gaze(self):
        """The wrapped model's ``detect_gaze``, with caches set up first."""
        self._setup_caches()
        return self.model.detect_gaze

    def answer_question(
        self,
        image_embeds,
        question,
        tokenizer=None,
        chat_history="",
        result_queue=None,
        max_new_tokens=256,
        **kwargs
    ):
        """Answer ``question`` about ``image_embeds`` and return the stripped text.

        ``tokenizer``, ``chat_history``, ``max_new_tokens`` and ``kwargs`` are
        accepted but not used. When ``result_queue`` is given, the answer is
        also ``put`` on it before being returned.
        """
        answer = self.query(image_embeds, question)["answer"].strip()
        if result_queue is not None:
            result_queue.put(answer)
        return answer

    def batch_answer(self, images, prompts, tokenizer=None, **kwargs):
        """Answer each ``(image, prompt)`` pair in turn and return the answers.

        Pairs are formed with ``zip``, so extra items in the longer input are
        dropped. ``tokenizer`` and ``kwargs`` are accepted but not used.
        """
        return [
            self.query(image, prompt)["answer"].strip()
            for image, prompt in zip(images, prompts)
        ]

    def _unsupported_exception(self):
        """Raise ``NotImplementedError`` for APIs that are no longer supported."""
        raise NotImplementedError(
            "This method is not supported in the latest version of moondream. "
            "Consider upgrading to the updated API spec, or alternately pin "
            "to 'revision=2024-08-26'."
        )

    def generate(self, image_embeds, prompt, tokenizer, max_new_tokens=128, **kwargs):
        """Generate an answer for ``prompt``; returns a one-element list.

        Function definition remains unchanged for backwards compatibility.
        Be aware that ``tokenizer`` and ``kwargs`` are ignored, and that
        ``max_new_tokens`` is only used when ``prompt`` is not in the legacy
        question format recognised by ``extract_question``.
        """
        prompt_extracted = extract_question(prompt)
        if prompt_extracted is not None:
            # Use the ``query`` property rather than ``self.model.query`` so
            # the lazy cache setup also runs on this path.
            answer = self.query(
                image=image_embeds, question=prompt_extracted, stream=False
            )["answer"]
        else:
            image_embeds = self.encode_image(image_embeds)
            prompt_tokens = torch.tensor(
                [self.model.tokenizer.encode(prompt).ids],
                device=self.device,
            )
            # ``_generate_answer`` is iterated for text pieces, which are
            # concatenated into the final answer.
            answer = "".join(
                self.model._generate_answer(
                    prompt_tokens,
                    image_embeds.kv_cache,
                    image_embeds.pos,
                    max_new_tokens,
                )
            )
        return [answer]

    def get_input_embeddings(self) -> nn.Embedding:
        """
        Lazily wrap the raw parameter `self.model.text.wte` in a real
        `nn.Embedding` layer so that HF mix-ins recognise it. The wrapper
        **shares** the weight tensor—no copy is made.
        """
        if not hasattr(self, "_input_embeddings"):
            self._input_embeddings = nn.Embedding.from_pretrained(
                self.model.text.wte,  # tensor created in text.py
                freeze=True,  # set to False if you need it trainable
            )
        return self._input_embeddings

    def set_input_embeddings(self, value: Union[nn.Embedding, nn.Module]) -> None:
        """
        Lets HF functions (e.g. `resize_token_embeddings`) replace or resize the
        embeddings and keeps everything tied to `self.model.text.wte`.
        """
        # 1. point the low-level parameter to the new weight matrix
        self.model.text.wte = value.weight
        # 2. keep a reference for get_input_embeddings()
        self._input_embeddings = value

    def input_embeds(
        self,
        input_ids: Union[torch.LongTensor, list, tuple],
        *,
        device: Union[torch.device, None] = None
    ) -> torch.FloatTensor:
        """
        Back-compat wrapper that turns token IDs into embeddings.

        Non-tensor ``input_ids`` are converted with ``torch.as_tensor``; when
        ``device`` is given, the IDs are moved there before the lookup.

        Example:
            ids = torch.tensor([[1, 2, 3]])
            embeds = model.input_embeds(ids)  # (1, 3, hidden_dim)
        """
        if not torch.is_tensor(input_ids):
            input_ids = torch.as_tensor(input_ids)
        if device is not None:
            input_ids = input_ids.to(device)
        return self.get_input_embeddings()(input_ids)