| import sys |
| from pathlib import Path |
| from typing import Any |
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.models.qwen2 import Qwen2Config |
| from autogaze.vision_encoders.siglip.configuration_siglip import SiglipVisionConfig |
|
|
|
|
class NVILAConfig(PretrainedConfig):
    """Configuration for the NVILA vision-language model.

    Composes a Qwen2 text-backbone configuration with a SigLIP vision-encoder
    configuration, plus the special token ids used to mark image/video
    positions in the token stream.

    Args:
        text_config: Qwen2 text-model config, given either as a plain dict
            (e.g. deserialized from JSON) or an already-built ``Qwen2Config``.
            Defaults to a fresh ``Qwen2Config()``.
        vision_config: SigLIP vision-model config, as a dict or an
            already-built ``SiglipVisionConfig``. Defaults to a fresh
            ``SiglipVisionConfig()``.
        image_token_id: Token id of the image placeholder; ``-1`` when unset.
        video_token_id: Token id of the video placeholder; ``-1`` when unset.
        max_batch_size_siglip: Maximum batch size used for the SigLIP
            encoder.  NOTE(review): presumably a memory cap on images per
            vision forward pass — confirm against the modeling code.
        **kwargs: Forwarded to :class:`PretrainedConfig`.
    """

    model_type = "nvila"
    sub_configs = {
        "text_config": Qwen2Config,
        "vision_config": SiglipVisionConfig,
    }
    _auto_class = "AutoConfig"

    def __init__(
        self,
        *,
        text_config: dict[str, Any] | Qwen2Config | None = None,
        vision_config: dict[str, Any] | SiglipVisionConfig | None = None,
        image_token_id: int | None = None,
        video_token_id: int | None = None,
        max_batch_size_siglip: int = 16,
        **kwargs,
    ):
        # Accept either a serialized dict or a pre-built config object for
        # each sub-config (the standard HF composite-config convention);
        # the original only handled the dict/None cases and crashed on a
        # config instance (``Qwen2Config(**cfg)`` with a non-mapping).
        if isinstance(text_config, dict):
            text_config = Qwen2Config(**text_config)
        self.text_config = text_config if text_config is not None else Qwen2Config()

        if isinstance(vision_config, dict):
            vision_config = SiglipVisionConfig(**vision_config)
        self.vision_config = vision_config if vision_config is not None else SiglipVisionConfig()

        # -1 is the "unset" sentinel; real ids come from the tokenizer/processor.
        self.image_token_id = image_token_id if image_token_id is not None else -1
        self.video_token_id = video_token_id if video_token_id is not None else -1
        self.max_batch_size_siglip = max_batch_size_siglip

        # Call last so PretrainedConfig sees the remaining kwargs only,
        # after the sub-configs and ids above are already attached.
        super().__init__(**kwargs)
|
|