{ "model_type": "vit-captioner-bias-decoder", "feature_extractor": "google/vit-base-patch16-224-in21k", "vocab_size": 75460, "seq_len": 32, "feature_dim": 768, "training_epochs": 3, "dataset": "ROCO-radiology (train + val + test)", "trainable": "Decoder + ViT biases only", "description": "ROCO radiology captioner trained for 3 epochs on the full dataset using cached ViT features." }