fschlatt committed on
Commit
ce84b1e
·
verified ·
1 Parent(s): df8dfcc

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "absolute_positional_embedding_type": null,
3
+ "architectures": [
4
+ "TiteForPreTraining"
5
+ ],
6
+ "dropout_prob": 0.1,
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_sizes": [
9
+ 768,
10
+ 768,
11
+ 768,
12
+ 1024,
13
+ 1024,
14
+ 1024,
15
+ 1280,
16
+ 1280,
17
+ 1280,
18
+ 1536,
19
+ 1536,
20
+ 1536
21
+ ],
22
+ "initializer_range": 0.02,
23
+ "intermediate_sizes": [
24
+ 3072,
25
+ 3072,
26
+ 3072,
27
+ 4096,
28
+ 4096,
29
+ 4096,
30
+ 5120,
31
+ 5120,
32
+ 5120,
33
+ 6144,
34
+ 6144,
35
+ 6144
36
+ ],
37
+ "kernel_sizes": [
38
+ null,
39
+ null,
40
+ null,
41
+ 2,
42
+ 2,
43
+ 2,
44
+ 2,
45
+ 2,
46
+ 2,
47
+ 2,
48
+ 2,
49
+ 2
50
+ ],
51
+ "layer_norm_eps": 1e-12,
52
+ "max_position_embeddings": 512,
53
+ "model_type": "tite",
54
+ "norm_location": "post",
55
+ "norm_type": "layer",
56
+ "num_attention_heads": [
57
+ 12,
58
+ 12,
59
+ 12,
60
+ 16,
61
+ 16,
62
+ 16,
63
+ 20,
64
+ 20,
65
+ 20,
66
+ 24,
67
+ 24,
68
+ 24
69
+ ],
70
+ "num_hidden_layers": 12,
71
+ "pad_token_id": 0,
72
+ "pooling_implementation": "triton",
73
+ "pooling_location": "intra",
74
+ "positional_embedding_type": null,
75
+ "relative_positional_embedding_type": "rotary",
76
+ "rope_implementation": "eager",
77
+ "rotary_interleaved": true,
78
+ "strides": [
79
+ null,
80
+ null,
81
+ null,
82
+ 2,
83
+ 2,
84
+ 2,
85
+ 2,
86
+ 2,
87
+ 2,
88
+ 2,
89
+ 2,
90
+ 2
91
+ ],
92
+ "torch_dtype": "float32",
93
+ "transformers_version": "4.52.4",
94
+ "vocab_size": 30522
95
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3169c5972b94cc524fe3f501ba8d4f627fa7eb83a01c3e093d6b925533ded5b8
3
+ size 1125291984
pl_config.yaml ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # lightning.pytorch==2.5.2
2
+ seed_everything: 42
3
+ trainer:
4
+ accelerator: auto
5
+ strategy: auto
6
+ devices: auto
7
+ num_nodes: 1
8
+ precision: bf16-mixed
9
+ callbacks:
10
+ - class_path: lightning.pytorch.callbacks.ModelCheckpoint
11
+ init_args:
12
+ dirpath: null
13
+ filename: null
14
+ monitor: null
15
+ verbose: false
16
+ save_last: null
17
+ save_top_k: 1
18
+ save_weights_only: false
19
+ mode: min
20
+ auto_insert_metric_name: true
21
+ every_n_train_steps: null
22
+ train_time_interval: null
23
+ every_n_epochs: null
24
+ save_on_train_epoch_end: null
25
+ enable_version_counter: true
26
+ fast_dev_run: false
27
+ max_epochs: null
28
+ min_epochs: null
29
+ max_steps: 200000
30
+ min_steps: null
31
+ max_time: null
32
+ limit_train_batches: null
33
+ limit_val_batches: null
34
+ limit_test_batches: null
35
+ limit_predict_batches: null
36
+ overfit_batches: 0.0
37
+ val_check_interval: 50000
38
+ check_val_every_n_epoch: 1
39
+ num_sanity_val_steps: null
40
+ log_every_n_steps: null
41
+ enable_checkpointing: null
42
+ enable_progress_bar: false
43
+ enable_model_summary: null
44
+ accumulate_grad_batches: 2
45
+ gradient_clip_val: 1
46
+ gradient_clip_algorithm: null
47
+ deterministic: null
48
+ benchmark: null
49
+ inference_mode: true
50
+ use_distributed_sampler: true
51
+ profiler: null
52
+ detect_anomaly: false
53
+ barebones: false
54
+ plugins: null
55
+ sync_batchnorm: false
56
+ reload_dataloaders_every_n_epochs: 0
57
+ default_root_dir: null
58
+ model_registry: null
59
+ model:
60
+ class_path: tite.module.TiteModule
61
+ init_args:
62
+ model:
63
+ class_path: tite.model.TiteForPreTraining
64
+ init_args:
65
+ config:
66
+ class_path: tite.model.TiteConfig
67
+ init_args:
68
+ vocab_size: 30522
69
+ num_hidden_layers: 12
70
+ hidden_sizes:
71
+ - 768
72
+ - 768
73
+ - 768
74
+ - 1024
75
+ - 1024
76
+ - 1024
77
+ - 1280
78
+ - 1280
79
+ - 1280
80
+ - 1536
81
+ - 1536
82
+ - 1536
83
+ num_attention_heads:
84
+ - 12
85
+ - 12
86
+ - 12
87
+ - 16
88
+ - 16
89
+ - 16
90
+ - 20
91
+ - 20
92
+ - 20
93
+ - 24
94
+ - 24
95
+ - 24
96
+ intermediate_sizes:
97
+ - 3072
98
+ - 3072
99
+ - 3072
100
+ - 4096
101
+ - 4096
102
+ - 4096
103
+ - 5120
104
+ - 5120
105
+ - 5120
106
+ - 6144
107
+ - 6144
108
+ - 6144
109
+ kernel_sizes:
110
+ - null
111
+ - null
112
+ - null
113
+ - 2
114
+ - 2
115
+ - 2
116
+ - 2
117
+ - 2
118
+ - 2
119
+ - 2
120
+ - 2
121
+ - 2
122
+ strides:
123
+ - null
124
+ - null
125
+ - null
126
+ - 2
127
+ - 2
128
+ - 2
129
+ - 2
130
+ - 2
131
+ - 2
132
+ - 2
133
+ - 2
134
+ - 2
135
+ dropout_prob: 0.1
136
+ max_position_embeddings: 512
137
+ initializer_range: 0.02
138
+ layer_norm_eps: 1.0e-12
139
+ pad_token_id: 0
140
+ hidden_act: gelu_pytorch_tanh
141
+ absolute_positional_embedding_type: null
142
+ relative_positional_embedding_type: rotary
143
+ pooling_location: intra
144
+ rotary_interleaved: true
145
+ norm_location: post
146
+ norm_type: layer
147
+ pooling_implementation: triton
148
+ rope_implementation: eager
149
+ positional_embedding_type: null
150
+ enhanced_masked_auto_encoding: true
151
+ bow_auto_encoding: true
152
+ tokenizer:
153
+ class_path: tite.model.TiteTokenizer
154
+ init_args:
155
+ vocab_file: tokenizers/tite/vocab.txt
156
+ tokenizer_file: tokenizers/tite/tokenizer.json
157
+ do_lower_case: true
158
+ unk_token: '[UNK]'
159
+ sep_token: '[SEP]'
160
+ pad_token: '[PAD]'
161
+ cls_token: '[CLS]'
162
+ mask_token: '[MASK]'
163
+ tokenize_chinese_chars: true
164
+ strip_accents: null
165
+ dict_kwargs:
166
+ model_max_length: 512
167
+ validate_on_glue: true
168
+ validate_on_trec_dl: true
169
+ log_gradients: false
170
+ compile: true
171
+ data:
172
+ class_path: tite.datasets.FineWebDataModule
173
+ init_args:
174
+ collator:
175
+ class_path: tite.datasets.TransformationCollator
176
+ init_args:
177
+ text_keys:
178
+ - text
179
+ - null
180
+ string_transformations: null
181
+ token_transformations:
182
+ - class_path: tite.transformation.TokenMask
183
+ init_args:
184
+ mask_id: 103
185
+ mask_prob: 0.3
186
+ transformation_prob: 1.0
187
+ max_length: 512
188
+ path: HuggingFaceFW/fineweb-edu
189
+ batch_size: 128
190
+ seed: null
191
+ num_workers: 8
192
+ streaming: true
193
+ lr_scheduler:
194
+ class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
195
+ init_args:
196
+ num_warmup_steps: 3000
197
+ final_value: 0.02
198
+ num_delay_steps: 0
199
+ optimizer:
200
+ class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
201
+ init_args:
202
+ lr: 0.0001
203
+ betas:
204
+ - 0.9
205
+ - 0.999
206
+ eps: 1.0e-08
207
+ weight_decay: 0.01
208
+ amsgrad: false
209
+ maximize: false
210
+ foreach: null
211
+ capturable: false
212
+ differentiable: false
213
+ fused: null
214
+ ckpt_path: null
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "max_length": 512,
50
+ "model_max_length": 512,
51
+ "pad_to_multiple_of": 8,
52
+ "pad_token": "[PAD]",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "[SEP]",
56
+ "stride": 0,
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "TiteTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "[UNK]"
63
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff