Upload Javanese pruned model

Browse files

Files changed (5) hide show

README.md +52 -0
config.json +78 -0
model.safetensors +3 -0
tokenizer.json +0 -0
tokenizer_config.json +17 -0

README.md ADDED Viewed

	@@ -0,0 +1,52 @@

+---
+pipeline_tag: feature-extraction
+language: jav
+license: mit
+tags:
+  - trimmed
+library_name: transformers
+base_model: jhu-clsp/mmBERT-base
+base_model_relation: quantized
+datasets:
+  - Lumberjackk/fineweb-2-trimming
+---
+# mmBERT-base-jav-32768
+This model is a 59.95% smaller version of [jhu-clsp/mmBERT-base](https://huggingface.co/jhu-clsp/mmBERT-base) optimized for the Javanese language via vocabulary size reduction using the [trimming](https://huggingface.co/blog/introduction-to-trimming) method.
+This trimmed model should perform similarly to the original model on Javanese text with only 16,384 tokens and a much smaller memory footprint. However, it may not perform well for other languages, as tokens not commonly used in Javanese were removed from the vocabulary.
+## Model Statistics
+| Metric | Original | Trimmed | Reduction |
+|--------|----------|---------|-----------|
+| **Vocabulary size** | 256,000 tokens | 16,384 tokens | **93.60%** |
+| **Model size** | 306,939,648 params | 122,914,560 params | **59.95%** |
+![image](https://cdn-uploads.huggingface.co/production/uploads/613b0a62a14099d5afed7830/3bAHdqRvu-haO_RxyOwVo.png)
+## Mining Dataset Statistics
+- **Number of texts used for mining**: 200,000 texts
+- **Dataset**: [Lumberjackk/fineweb-2-trimming](https://huggingface.co/datasets/Lumberjackk/fineweb-2-trimming)
+## Usage
+```python
+from transformers import AutoModel, AutoTokenizer
+model_name = "Lumberjackk/mmBERT-base-jav-32768"
+model = AutoModel.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+## Citation
+#### mmBERT
+```bibtex
+@misc{marone2025mmbertmodernmultilingualencoder,
+      title={mmBERT: A Modern Multilingual Encoder with Annealed Language Learning},
+      author={Marc Marone and Orion Weller and William Fleshman and Eugene Yang and Dawn Lawrie and Benjamin Van Durme},
+      year={2025},
+      eprint={2509.06888},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2509.06888},
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,78 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 1,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "float32",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 1,
+  "global_attn_every_n_layers": 3,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "layer_types": [
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "local_attention": 128,
+  "mask_token_id": 4,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 0,
+  "position_embedding_type": "sans_pos",
+  "rope_parameters": {
+    "full_attention": {
+      "rope_theta": 160000,
+      "rope_type": "default"
+    },
+    "sliding_attention": {
+      "rope_theta": 160000,
+      "rope_type": "default"
+    }
+  },
+  "sep_token_id": 1,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0.dev0",
+  "vocab_size": 16384
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40f97a3522f136af06757a57961b31111eae58c8acbb42ffdfc22db4a71b49ed
+size 491671256

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "cls_token": "<bos>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "sep_token": "<eos>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ]
+}