# NVILA-8B-HD-Video / configuration_nvila.py
# Danny Yin
# release
# 73b433d
import sys
from pathlib import Path
from typing import Any
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config
from autogaze.vision_encoders.siglip.configuration_siglip import SiglipVisionConfig
class NVILAConfig(PretrainedConfig):
    """Composite configuration for an NVILA model.

    Bundles a ``Qwen2Config`` (``text_config``) and a ``SiglipVisionConfig``
    (``vision_config``) together with the image/video token ids and a SigLIP
    batch-size setting.

    Args:
        text_config: Constructor kwargs for ``Qwen2Config`` as a dict, an
            already-built ``Qwen2Config`` instance, or ``None`` to use
            ``Qwen2Config()`` defaults.
        vision_config: Constructor kwargs for ``SiglipVisionConfig`` as a
            dict, an already-built ``SiglipVisionConfig`` instance, or
            ``None`` to use ``SiglipVisionConfig()`` defaults.
        image_token_id: Stored as ``image_token_id``; ``None`` is stored
            as ``-1``.
        video_token_id: Stored as ``video_token_id``; ``None`` is stored
            as ``-1``.
        max_batch_size_siglip: Stored unchanged.
            NOTE(review): presumably the maximum batch size fed to the SigLIP
            vision encoder per forward pass -- confirm against the modeling
            code.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        TypeError: If ``text_config`` / ``vision_config`` is neither ``None``,
            an instance of the expected config class, nor a mapping of
            keyword arguments.
    """

    model_type = "nvila"
    sub_configs = {
        "text_config": Qwen2Config,
        "vision_config": SiglipVisionConfig,
    }
    # NOTE(review): looks like this mirrors what ``register_for_auto_class``
    # sets, so the class is exposed through ``AutoConfig`` -- confirm.
    _auto_class = "AutoConfig"

    def __init__(
        self,
        *,
        text_config: dict[str, Any] | Qwen2Config | None = None,
        vision_config: dict[str, Any] | SiglipVisionConfig | None = None,
        image_token_id: int | None = None,
        video_token_id: int | None = None,
        max_batch_size_siglip: int = 16,
        **kwargs,
    ):
        # Sub-configs may arrive as a dict of constructor kwargs or as an
        # already-instantiated config object; ``None`` selects the defaults.
        # Unpacking an instance with ``**`` would raise, so instances are
        # used as-is.
        if text_config is None:
            text_config = Qwen2Config()
        elif not isinstance(text_config, Qwen2Config):
            text_config = Qwen2Config(**text_config)
        self.text_config = text_config

        if vision_config is None:
            vision_config = SiglipVisionConfig()
        elif not isinstance(vision_config, SiglipVisionConfig):
            vision_config = SiglipVisionConfig(**vision_config)
        self.vision_config = vision_config

        # -1 marks a token id that was not provided.
        self.image_token_id = image_token_id if image_token_id is not None else -1
        self.video_token_id = video_token_id if video_token_id is not None else -1

        self.max_batch_size_siglip = max_batch_size_siglip

        # Own attributes are assigned before delegating, same order as before.
        super().__init__(**kwargs)