Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

config.json +95 -0
model.safetensors +3 -0
pl_config.yaml +214 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +63 -0
vocab.txt +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,95 @@

+{
+  "absolute_positional_embedding_type": null,
+  "architectures": [
+    "TiteForPreTraining"
+  ],
+  "dropout_prob": 0.1,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_sizes": [
+    768,
+    768,
+    768,
+    1024,
+    1024,
+    1024,
+    1280,
+    1280,
+    1280,
+    1536,
+    1536,
+    1536
+  ],
+  "initializer_range": 0.02,
+  "intermediate_sizes": [
+    3072,
+    3072,
+    3072,
+    4096,
+    4096,
+    4096,
+    5120,
+    5120,
+    5120,
+    6144,
+    6144,
+    6144
+  ],
+  "kernel_sizes": [
+    null,
+    null,
+    null,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "tite",
+  "norm_location": "post",
+  "norm_type": "layer",
+  "num_attention_heads": [
+    12,
+    12,
+    12,
+    16,
+    16,
+    16,
+    20,
+    20,
+    20,
+    24,
+    24,
+    24
+  ],
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooling_implementation": "triton",
+  "pooling_location": "intra",
+  "positional_embedding_type": null,
+  "relative_positional_embedding_type": "rotary",
+  "rope_implementation": "eager",
+  "rotary_interleaved": true,
+  "strides": [
+    null,
+    null,
+    null,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
+  "vocab_size": 30522
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3169c5972b94cc524fe3f501ba8d4f627fa7eb83a01c3e093d6b925533ded5b8
+size 1125291984

pl_config.yaml ADDED Viewed

	@@ -0,0 +1,214 @@

+# lightning.pytorch==2.5.2
+seed_everything: 42
+trainer:
+  accelerator: auto
+  strategy: auto
+  devices: auto
+  num_nodes: 1
+  precision: bf16-mixed
+  callbacks:
+  - class_path: lightning.pytorch.callbacks.ModelCheckpoint
+    init_args:
+      dirpath: null
+      filename: null
+      monitor: null
+      verbose: false
+      save_last: null
+      save_top_k: 1
+      save_weights_only: false
+      mode: min
+      auto_insert_metric_name: true
+      every_n_train_steps: null
+      train_time_interval: null
+      every_n_epochs: null
+      save_on_train_epoch_end: null
+      enable_version_counter: true
+  fast_dev_run: false
+  max_epochs: null
+  min_epochs: null
+  max_steps: 200000
+  min_steps: null
+  max_time: null
+  limit_train_batches: null
+  limit_val_batches: null
+  limit_test_batches: null
+  limit_predict_batches: null
+  overfit_batches: 0.0
+  val_check_interval: 50000
+  check_val_every_n_epoch: 1
+  num_sanity_val_steps: null
+  log_every_n_steps: null
+  enable_checkpointing: null
+  enable_progress_bar: false
+  enable_model_summary: null
+  accumulate_grad_batches: 2
+  gradient_clip_val: 1
+  gradient_clip_algorithm: null
+  deterministic: null
+  benchmark: null
+  inference_mode: true
+  use_distributed_sampler: true
+  profiler: null
+  detect_anomaly: false
+  barebones: false
+  plugins: null
+  sync_batchnorm: false
+  reload_dataloaders_every_n_epochs: 0
+  default_root_dir: null
+  model_registry: null
+model:
+  class_path: tite.module.TiteModule
+  init_args:
+    model:
+      class_path: tite.model.TiteForPreTraining
+      init_args:
+        config:
+          class_path: tite.model.TiteConfig
+          init_args:
+            vocab_size: 30522
+            num_hidden_layers: 12
+            hidden_sizes:
+            - 768
+            - 768
+            - 768
+            - 1024
+            - 1024
+            - 1024
+            - 1280
+            - 1280
+            - 1280
+            - 1536
+            - 1536
+            - 1536
+            num_attention_heads:
+            - 12
+            - 12
+            - 12
+            - 16
+            - 16
+            - 16
+            - 20
+            - 20
+            - 20
+            - 24
+            - 24
+            - 24
+            intermediate_sizes:
+            - 3072
+            - 3072
+            - 3072
+            - 4096
+            - 4096
+            - 4096
+            - 5120
+            - 5120
+            - 5120
+            - 6144
+            - 6144
+            - 6144
+            kernel_sizes:
+            - null
+            - null
+            - null
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            strides:
+            - null
+            - null
+            - null
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            - 2
+            dropout_prob: 0.1
+            max_position_embeddings: 512
+            initializer_range: 0.02
+            layer_norm_eps: 1.0e-12
+            pad_token_id: 0
+            hidden_act: gelu_pytorch_tanh
+            absolute_positional_embedding_type: null
+            relative_positional_embedding_type: rotary
+            pooling_location: intra
+            rotary_interleaved: true
+            norm_location: post
+            norm_type: layer
+            pooling_implementation: triton
+            rope_implementation: eager
+            positional_embedding_type: null
+        enhanced_masked_auto_encoding: true
+        bow_auto_encoding: true
+    tokenizer:
+      class_path: tite.model.TiteTokenizer
+      init_args:
+        vocab_file: tokenizers/tite/vocab.txt
+        tokenizer_file: tokenizers/tite/tokenizer.json
+        do_lower_case: true
+        unk_token: '[UNK]'
+        sep_token: '[SEP]'
+        pad_token: '[PAD]'
+        cls_token: '[CLS]'
+        mask_token: '[MASK]'
+        tokenize_chinese_chars: true
+        strip_accents: null
+      dict_kwargs:
+        model_max_length: 512
+    validate_on_glue: true
+    validate_on_trec_dl: true
+    log_gradients: false
+    compile: true
+data:
+  class_path: tite.datasets.FineWebDataModule
+  init_args:
+    collator:
+      class_path: tite.datasets.TransformationCollator
+      init_args:
+        text_keys:
+        - text
+        - null
+        string_transformations: null
+        token_transformations:
+        - class_path: tite.transformation.TokenMask
+          init_args:
+            mask_id: 103
+            mask_prob: 0.3
+            transformation_prob: 1.0
+        max_length: 512
+    path: HuggingFaceFW/fineweb-edu
+    batch_size: 128
+    seed: null
+    num_workers: 8
+    streaming: true
+lr_scheduler:
+  class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
+  init_args:
+    num_warmup_steps: 3000
+    final_value: 0.02
+    num_delay_steps: 0
+optimizer:
+  class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
+  init_args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    eps: 1.0e-08
+    weight_decay: 0.01
+    amsgrad: false
+    maximize: false
+    foreach: null
+    capturable: false
+    differentiable: false
+    fused: null
+ckpt_path: null

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 512,
+  "model_max_length": 512,
+  "pad_to_multiple_of": 8,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "TiteTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff