n0w0f commited on
Commit
3e24a2f
·
verified ·
1 Parent(s): 405dc56

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 512,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 1024,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 4,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.57.6",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357b8e65d7d9b036190922ac184ca746274f953d347baeec8b626ff48e5486d8
3
+ size 133031496
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ccd87af85a2958de00c3090ab85d6d5cdc1aefdeeaa8077a3a7270ac83f5af
3
+ size 266109515
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:344d546b52136b7b336b91a4db8709ac3ff3ac644fbaf7ae1ee9e638ac334fee
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9f025cb099f6a1c3c828b0e0495822252290f6b856f0800fe2bb140dc812b20
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[EOS]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "30001": {
44
+ "content": "[EOS]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "30002": {
52
+ "content": "[BOS]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "bos_token": "[BOS]",
61
+ "clean_up_tokenization_spaces": false,
62
+ "cls_token": "[CLS]",
63
+ "eos_token": "[EOS]",
64
+ "extra_special_tokens": {},
65
+ "mask_token": "[MASK]",
66
+ "model_max_length": 1000000000000000019884624838656,
67
+ "pad_token": "[PAD]",
68
+ "sep_token": "[SEP]",
69
+ "tokenizer_class": "PreTrainedTokenizerFast",
70
+ "unk_token": "[UNK]"
71
+ }
trainer_state.json ADDED
@@ -0,0 +1,1843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 6000,
3
+ "best_metric": 0.13940538465976715,
4
+ "best_model_checkpoint": "/data/alamparan/mattext_ckpt/results_2m/2026-02-06/03-23-25/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-6000",
5
+ "epoch": 0.5780903747952597,
6
+ "eval_steps": 50,
7
+ "global_step": 6000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.004817419789960497,
14
+ "grad_norm": 1.3140820264816284,
15
+ "learning_rate": 0.00019998111571442338,
16
+ "loss": 5.9543,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.004817419789960497,
21
+ "eval_loss": 4.040468692779541,
22
+ "eval_runtime": 85.9858,
23
+ "eval_samples_per_second": 221.013,
24
+ "eval_steps_per_second": 4.605,
25
+ "step": 50
26
+ },
27
+ {
28
+ "epoch": 0.009634839579920994,
29
+ "grad_norm": 0.8951072692871094,
30
+ "learning_rate": 0.0001999618460352635,
31
+ "loss": 3.7958,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.009634839579920994,
36
+ "eval_loss": 3.572112798690796,
37
+ "eval_runtime": 87.5351,
38
+ "eval_samples_per_second": 217.102,
39
+ "eval_steps_per_second": 4.524,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 0.014452259369881492,
44
+ "grad_norm": 1.1552037000656128,
45
+ "learning_rate": 0.00019994257635610368,
46
+ "loss": 3.4955,
47
+ "step": 150
48
+ },
49
+ {
50
+ "epoch": 0.014452259369881492,
51
+ "eval_loss": 3.365239381790161,
52
+ "eval_runtime": 86.7746,
53
+ "eval_samples_per_second": 219.004,
54
+ "eval_steps_per_second": 4.564,
55
+ "step": 150
56
+ },
57
+ {
58
+ "epoch": 0.019269679159841988,
59
+ "grad_norm": 1.1728101968765259,
60
+ "learning_rate": 0.00019992330667694384,
61
+ "loss": 3.3176,
62
+ "step": 200
63
+ },
64
+ {
65
+ "epoch": 0.019269679159841988,
66
+ "eval_loss": 3.2046337127685547,
67
+ "eval_runtime": 86.8432,
68
+ "eval_samples_per_second": 218.831,
69
+ "eval_steps_per_second": 4.56,
70
+ "step": 200
71
+ },
72
+ {
73
+ "epoch": 0.024087098949802484,
74
+ "grad_norm": 0.8054835200309753,
75
+ "learning_rate": 0.000199904036997784,
76
+ "loss": 3.1815,
77
+ "step": 250
78
+ },
79
+ {
80
+ "epoch": 0.024087098949802484,
81
+ "eval_loss": 3.065305709838867,
82
+ "eval_runtime": 85.5782,
83
+ "eval_samples_per_second": 222.066,
84
+ "eval_steps_per_second": 4.627,
85
+ "step": 250
86
+ },
87
+ {
88
+ "epoch": 0.028904518739762984,
89
+ "grad_norm": 1.2382686138153076,
90
+ "learning_rate": 0.00019988476731862417,
91
+ "loss": 3.0462,
92
+ "step": 300
93
+ },
94
+ {
95
+ "epoch": 0.028904518739762984,
96
+ "eval_loss": 2.937901020050049,
97
+ "eval_runtime": 86.036,
98
+ "eval_samples_per_second": 220.884,
99
+ "eval_steps_per_second": 4.603,
100
+ "step": 300
101
+ },
102
+ {
103
+ "epoch": 0.03372193852972348,
104
+ "grad_norm": 1.245282530784607,
105
+ "learning_rate": 0.00019986549763946433,
106
+ "loss": 2.9173,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.03372193852972348,
111
+ "eval_loss": 2.7557334899902344,
112
+ "eval_runtime": 88.8138,
113
+ "eval_samples_per_second": 213.976,
114
+ "eval_steps_per_second": 4.459,
115
+ "step": 350
116
+ },
117
+ {
118
+ "epoch": 0.038539358319683976,
119
+ "grad_norm": 1.2060555219650269,
120
+ "learning_rate": 0.00019984622796030447,
121
+ "loss": 2.7187,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.038539358319683976,
126
+ "eval_loss": 2.480039119720459,
127
+ "eval_runtime": 88.2775,
128
+ "eval_samples_per_second": 215.276,
129
+ "eval_steps_per_second": 4.486,
130
+ "step": 400
131
+ },
132
+ {
133
+ "epoch": 0.04335677810964447,
134
+ "grad_norm": 1.9137617349624634,
135
+ "learning_rate": 0.00019982695828114463,
136
+ "loss": 2.3768,
137
+ "step": 450
138
+ },
139
+ {
140
+ "epoch": 0.04335677810964447,
141
+ "eval_loss": 1.9087786674499512,
142
+ "eval_runtime": 88.071,
143
+ "eval_samples_per_second": 215.78,
144
+ "eval_steps_per_second": 4.496,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.04817419789960497,
149
+ "grad_norm": 1.4742215871810913,
150
+ "learning_rate": 0.00019980768860198477,
151
+ "loss": 1.7234,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 0.04817419789960497,
156
+ "eval_loss": 1.158734679222107,
157
+ "eval_runtime": 88.5849,
158
+ "eval_samples_per_second": 214.529,
159
+ "eval_steps_per_second": 4.47,
160
+ "step": 500
161
+ },
162
+ {
163
+ "epoch": 0.05299161768956547,
164
+ "grad_norm": 1.0141704082489014,
165
+ "learning_rate": 0.00019978841892282493,
166
+ "loss": 1.1418,
167
+ "step": 550
168
+ },
169
+ {
170
+ "epoch": 0.05299161768956547,
171
+ "eval_loss": 0.8516036868095398,
172
+ "eval_runtime": 87.5002,
173
+ "eval_samples_per_second": 217.188,
174
+ "eval_steps_per_second": 4.526,
175
+ "step": 550
176
+ },
177
+ {
178
+ "epoch": 0.05780903747952597,
179
+ "grad_norm": 0.8414612412452698,
180
+ "learning_rate": 0.0001997691492436651,
181
+ "loss": 0.881,
182
+ "step": 600
183
+ },
184
+ {
185
+ "epoch": 0.05780903747952597,
186
+ "eval_loss": 0.7400951385498047,
187
+ "eval_runtime": 88.6088,
188
+ "eval_samples_per_second": 214.471,
189
+ "eval_steps_per_second": 4.469,
190
+ "step": 600
191
+ },
192
+ {
193
+ "epoch": 0.06262645726948646,
194
+ "grad_norm": 0.694267213344574,
195
+ "learning_rate": 0.00019974987956450526,
196
+ "loss": 0.813,
197
+ "step": 650
198
+ },
199
+ {
200
+ "epoch": 0.06262645726948646,
201
+ "eval_loss": 0.6765585541725159,
202
+ "eval_runtime": 90.0063,
203
+ "eval_samples_per_second": 211.141,
204
+ "eval_steps_per_second": 4.4,
205
+ "step": 650
206
+ },
207
+ {
208
+ "epoch": 0.06744387705944696,
209
+ "grad_norm": 0.6236635446548462,
210
+ "learning_rate": 0.00019973060988534542,
211
+ "loss": 0.7301,
212
+ "step": 700
213
+ },
214
+ {
215
+ "epoch": 0.06744387705944696,
216
+ "eval_loss": 0.6367717981338501,
217
+ "eval_runtime": 90.1272,
218
+ "eval_samples_per_second": 210.857,
219
+ "eval_steps_per_second": 4.394,
220
+ "step": 700
221
+ },
222
+ {
223
+ "epoch": 0.07226129684940746,
224
+ "grad_norm": 0.663361132144928,
225
+ "learning_rate": 0.00019971134020618558,
226
+ "loss": 0.6936,
227
+ "step": 750
228
+ },
229
+ {
230
+ "epoch": 0.07226129684940746,
231
+ "eval_loss": 0.603342592716217,
232
+ "eval_runtime": 88.3855,
233
+ "eval_samples_per_second": 215.013,
234
+ "eval_steps_per_second": 4.48,
235
+ "step": 750
236
+ },
237
+ {
238
+ "epoch": 0.07707871663936795,
239
+ "grad_norm": 0.584972083568573,
240
+ "learning_rate": 0.00019969207052702575,
241
+ "loss": 0.6492,
242
+ "step": 800
243
+ },
244
+ {
245
+ "epoch": 0.07707871663936795,
246
+ "eval_loss": 0.5670896768569946,
247
+ "eval_runtime": 88.8592,
248
+ "eval_samples_per_second": 213.867,
249
+ "eval_steps_per_second": 4.456,
250
+ "step": 800
251
+ },
252
+ {
253
+ "epoch": 0.08189613642932846,
254
+ "grad_norm": 0.6252435445785522,
255
+ "learning_rate": 0.00019967280084786588,
256
+ "loss": 0.6554,
257
+ "step": 850
258
+ },
259
+ {
260
+ "epoch": 0.08189613642932846,
261
+ "eval_loss": 0.5390716791152954,
262
+ "eval_runtime": 88.7469,
263
+ "eval_samples_per_second": 214.137,
264
+ "eval_steps_per_second": 4.462,
265
+ "step": 850
266
+ },
267
+ {
268
+ "epoch": 0.08671355621928895,
269
+ "grad_norm": 0.5381043553352356,
270
+ "learning_rate": 0.00019965353116870605,
271
+ "loss": 0.5847,
272
+ "step": 900
273
+ },
274
+ {
275
+ "epoch": 0.08671355621928895,
276
+ "eval_loss": 0.5227247476577759,
277
+ "eval_runtime": 87.1283,
278
+ "eval_samples_per_second": 218.115,
279
+ "eval_steps_per_second": 4.545,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 0.09153097600924945,
284
+ "grad_norm": 0.4695926308631897,
285
+ "learning_rate": 0.0001996342614895462,
286
+ "loss": 0.5787,
287
+ "step": 950
288
+ },
289
+ {
290
+ "epoch": 0.09153097600924945,
291
+ "eval_loss": 0.5046476721763611,
292
+ "eval_runtime": 88.0204,
293
+ "eval_samples_per_second": 215.905,
294
+ "eval_steps_per_second": 4.499,
295
+ "step": 950
296
+ },
297
+ {
298
+ "epoch": 0.09634839579920994,
299
+ "grad_norm": 0.49100059270858765,
300
+ "learning_rate": 0.00019961499181038637,
301
+ "loss": 0.5795,
302
+ "step": 1000
303
+ },
304
+ {
305
+ "epoch": 0.09634839579920994,
306
+ "eval_loss": 0.4904717803001404,
307
+ "eval_runtime": 87.871,
308
+ "eval_samples_per_second": 216.272,
309
+ "eval_steps_per_second": 4.507,
310
+ "step": 1000
311
+ },
312
+ {
313
+ "epoch": 0.10116581558917044,
314
+ "grad_norm": 0.48738670349121094,
315
+ "learning_rate": 0.00019959572213122654,
316
+ "loss": 0.5372,
317
+ "step": 1050
318
+ },
319
+ {
320
+ "epoch": 0.10116581558917044,
321
+ "eval_loss": 0.4725087285041809,
322
+ "eval_runtime": 87.3641,
323
+ "eval_samples_per_second": 217.526,
324
+ "eval_steps_per_second": 4.533,
325
+ "step": 1050
326
+ },
327
+ {
328
+ "epoch": 0.10598323537913094,
329
+ "grad_norm": 0.5932963490486145,
330
+ "learning_rate": 0.0001995764524520667,
331
+ "loss": 0.5074,
332
+ "step": 1100
333
+ },
334
+ {
335
+ "epoch": 0.10598323537913094,
336
+ "eval_loss": 0.4346506893634796,
337
+ "eval_runtime": 87.5476,
338
+ "eval_samples_per_second": 217.07,
339
+ "eval_steps_per_second": 4.523,
340
+ "step": 1100
341
+ },
342
+ {
343
+ "epoch": 0.11080065516909143,
344
+ "grad_norm": 0.5904502272605896,
345
+ "learning_rate": 0.00019955718277290684,
346
+ "loss": 0.4764,
347
+ "step": 1150
348
+ },
349
+ {
350
+ "epoch": 0.11080065516909143,
351
+ "eval_loss": 0.4015448987483978,
352
+ "eval_runtime": 88.5379,
353
+ "eval_samples_per_second": 214.642,
354
+ "eval_steps_per_second": 4.473,
355
+ "step": 1150
356
+ },
357
+ {
358
+ "epoch": 0.11561807495905194,
359
+ "grad_norm": 0.46411070227622986,
360
+ "learning_rate": 0.000199537913093747,
361
+ "loss": 0.4346,
362
+ "step": 1200
363
+ },
364
+ {
365
+ "epoch": 0.11561807495905194,
366
+ "eval_loss": 0.38101693987846375,
367
+ "eval_runtime": 87.5237,
368
+ "eval_samples_per_second": 217.13,
369
+ "eval_steps_per_second": 4.524,
370
+ "step": 1200
371
+ },
372
+ {
373
+ "epoch": 0.12043549474901243,
374
+ "grad_norm": 0.49205589294433594,
375
+ "learning_rate": 0.00019951864341458714,
376
+ "loss": 0.4151,
377
+ "step": 1250
378
+ },
379
+ {
380
+ "epoch": 0.12043549474901243,
381
+ "eval_loss": 0.3670279085636139,
382
+ "eval_runtime": 87.1762,
383
+ "eval_samples_per_second": 217.995,
384
+ "eval_steps_per_second": 4.543,
385
+ "step": 1250
386
+ },
387
+ {
388
+ "epoch": 0.12525291453897291,
389
+ "grad_norm": 0.5221843123435974,
390
+ "learning_rate": 0.0001994993737354273,
391
+ "loss": 0.3951,
392
+ "step": 1300
393
+ },
394
+ {
395
+ "epoch": 0.12525291453897291,
396
+ "eval_loss": 0.3488354980945587,
397
+ "eval_runtime": 87.8499,
398
+ "eval_samples_per_second": 216.324,
399
+ "eval_steps_per_second": 4.508,
400
+ "step": 1300
401
+ },
402
+ {
403
+ "epoch": 0.13007033432893342,
404
+ "grad_norm": 0.5032587647438049,
405
+ "learning_rate": 0.00019948010405626746,
406
+ "loss": 0.3593,
407
+ "step": 1350
408
+ },
409
+ {
410
+ "epoch": 0.13007033432893342,
411
+ "eval_loss": 0.33173102140426636,
412
+ "eval_runtime": 86.3749,
413
+ "eval_samples_per_second": 220.018,
414
+ "eval_steps_per_second": 4.585,
415
+ "step": 1350
416
+ },
417
+ {
418
+ "epoch": 0.13488775411889392,
419
+ "grad_norm": 0.5051923394203186,
420
+ "learning_rate": 0.00019946083437710763,
421
+ "loss": 0.3526,
422
+ "step": 1400
423
+ },
424
+ {
425
+ "epoch": 0.13488775411889392,
426
+ "eval_loss": 0.30538395047187805,
427
+ "eval_runtime": 89.1978,
428
+ "eval_samples_per_second": 213.055,
429
+ "eval_steps_per_second": 4.44,
430
+ "step": 1400
431
+ },
432
+ {
433
+ "epoch": 0.13970517390885442,
434
+ "grad_norm": 0.46636977791786194,
435
+ "learning_rate": 0.0001994415646979478,
436
+ "loss": 0.3299,
437
+ "step": 1450
438
+ },
439
+ {
440
+ "epoch": 0.13970517390885442,
441
+ "eval_loss": 0.28566229343414307,
442
+ "eval_runtime": 87.6248,
443
+ "eval_samples_per_second": 216.879,
444
+ "eval_steps_per_second": 4.519,
445
+ "step": 1450
446
+ },
447
+ {
448
+ "epoch": 0.14452259369881493,
449
+ "grad_norm": 0.4614977240562439,
450
+ "learning_rate": 0.00019942229501878795,
451
+ "loss": 0.2968,
452
+ "step": 1500
453
+ },
454
+ {
455
+ "epoch": 0.14452259369881493,
456
+ "eval_loss": 0.2682988941669464,
457
+ "eval_runtime": 88.341,
458
+ "eval_samples_per_second": 215.121,
459
+ "eval_steps_per_second": 4.483,
460
+ "step": 1500
461
+ },
462
+ {
463
+ "epoch": 0.1493400134887754,
464
+ "grad_norm": 0.40138185024261475,
465
+ "learning_rate": 0.00019940302533962812,
466
+ "loss": 0.2822,
467
+ "step": 1550
468
+ },
469
+ {
470
+ "epoch": 0.1493400134887754,
471
+ "eval_loss": 0.2586437165737152,
472
+ "eval_runtime": 87.7499,
473
+ "eval_samples_per_second": 216.57,
474
+ "eval_steps_per_second": 4.513,
475
+ "step": 1550
476
+ },
477
+ {
478
+ "epoch": 0.1541574332787359,
479
+ "grad_norm": 0.4057703912258148,
480
+ "learning_rate": 0.00019938375566046825,
481
+ "loss": 0.271,
482
+ "step": 1600
483
+ },
484
+ {
485
+ "epoch": 0.1541574332787359,
486
+ "eval_loss": 0.25495249032974243,
487
+ "eval_runtime": 87.2931,
488
+ "eval_samples_per_second": 217.703,
489
+ "eval_steps_per_second": 4.536,
490
+ "step": 1600
491
+ },
492
+ {
493
+ "epoch": 0.1589748530686964,
494
+ "grad_norm": 0.32808950543403625,
495
+ "learning_rate": 0.00019936448598130842,
496
+ "loss": 0.2547,
497
+ "step": 1650
498
+ },
499
+ {
500
+ "epoch": 0.1589748530686964,
501
+ "eval_loss": 0.243699312210083,
502
+ "eval_runtime": 86.1589,
503
+ "eval_samples_per_second": 220.569,
504
+ "eval_steps_per_second": 4.596,
505
+ "step": 1650
506
+ },
507
+ {
508
+ "epoch": 0.1637922728586569,
509
+ "grad_norm": 0.40255042910575867,
510
+ "learning_rate": 0.00019934521630214858,
511
+ "loss": 0.2549,
512
+ "step": 1700
513
+ },
514
+ {
515
+ "epoch": 0.1637922728586569,
516
+ "eval_loss": 0.24029429256916046,
517
+ "eval_runtime": 88.882,
518
+ "eval_samples_per_second": 213.812,
519
+ "eval_steps_per_second": 4.455,
520
+ "step": 1700
521
+ },
522
+ {
523
+ "epoch": 0.1686096926486174,
524
+ "grad_norm": 0.4509395956993103,
525
+ "learning_rate": 0.00019932594662298874,
526
+ "loss": 0.2541,
527
+ "step": 1750
528
+ },
529
+ {
530
+ "epoch": 0.1686096926486174,
531
+ "eval_loss": 0.23613183200359344,
532
+ "eval_runtime": 87.9005,
533
+ "eval_samples_per_second": 216.199,
534
+ "eval_steps_per_second": 4.505,
535
+ "step": 1750
536
+ },
537
+ {
538
+ "epoch": 0.1734271124385779,
539
+ "grad_norm": 0.3515077531337738,
540
+ "learning_rate": 0.0001993066769438289,
541
+ "loss": 0.2385,
542
+ "step": 1800
543
+ },
544
+ {
545
+ "epoch": 0.1734271124385779,
546
+ "eval_loss": 0.2291395217180252,
547
+ "eval_runtime": 87.3005,
548
+ "eval_samples_per_second": 217.685,
549
+ "eval_steps_per_second": 4.536,
550
+ "step": 1800
551
+ },
552
+ {
553
+ "epoch": 0.1782445322285384,
554
+ "grad_norm": 0.3758944272994995,
555
+ "learning_rate": 0.00019928740726466907,
556
+ "loss": 0.2406,
557
+ "step": 1850
558
+ },
559
+ {
560
+ "epoch": 0.1782445322285384,
561
+ "eval_loss": 0.22721637785434723,
562
+ "eval_runtime": 87.3928,
563
+ "eval_samples_per_second": 217.455,
564
+ "eval_steps_per_second": 4.531,
565
+ "step": 1850
566
+ },
567
+ {
568
+ "epoch": 0.1830619520184989,
569
+ "grad_norm": 0.33480963110923767,
570
+ "learning_rate": 0.0001992681375855092,
571
+ "loss": 0.2327,
572
+ "step": 1900
573
+ },
574
+ {
575
+ "epoch": 0.1830619520184989,
576
+ "eval_loss": 0.22441919147968292,
577
+ "eval_runtime": 87.7197,
578
+ "eval_samples_per_second": 216.645,
579
+ "eval_steps_per_second": 4.514,
580
+ "step": 1900
581
+ },
582
+ {
583
+ "epoch": 0.1878793718084594,
584
+ "grad_norm": 0.44033002853393555,
585
+ "learning_rate": 0.00019924886790634937,
586
+ "loss": 0.2327,
587
+ "step": 1950
588
+ },
589
+ {
590
+ "epoch": 0.1878793718084594,
591
+ "eval_loss": 0.21973223984241486,
592
+ "eval_runtime": 86.9627,
593
+ "eval_samples_per_second": 218.53,
594
+ "eval_steps_per_second": 4.554,
595
+ "step": 1950
596
+ },
597
+ {
598
+ "epoch": 0.19269679159841988,
599
+ "grad_norm": 0.36066603660583496,
600
+ "learning_rate": 0.0001992295982271895,
601
+ "loss": 0.2204,
602
+ "step": 2000
603
+ },
604
+ {
605
+ "epoch": 0.19269679159841988,
606
+ "eval_loss": 0.21755366027355194,
607
+ "eval_runtime": 85.9832,
608
+ "eval_samples_per_second": 221.02,
609
+ "eval_steps_per_second": 4.606,
610
+ "step": 2000
611
+ },
612
+ {
613
+ "epoch": 0.19751421138838038,
614
+ "grad_norm": 0.3927019536495209,
615
+ "learning_rate": 0.00019921032854802967,
616
+ "loss": 0.2227,
617
+ "step": 2050
618
+ },
619
+ {
620
+ "epoch": 0.19751421138838038,
621
+ "eval_loss": 0.21245667338371277,
622
+ "eval_runtime": 88.7726,
623
+ "eval_samples_per_second": 214.075,
624
+ "eval_steps_per_second": 4.461,
625
+ "step": 2050
626
+ },
627
+ {
628
+ "epoch": 0.20233163117834088,
629
+ "grad_norm": 0.33693307638168335,
630
+ "learning_rate": 0.00019919105886886983,
631
+ "loss": 0.2193,
632
+ "step": 2100
633
+ },
634
+ {
635
+ "epoch": 0.20233163117834088,
636
+ "eval_loss": 0.21231026947498322,
637
+ "eval_runtime": 86.8589,
638
+ "eval_samples_per_second": 218.792,
639
+ "eval_steps_per_second": 4.559,
640
+ "step": 2100
641
+ },
642
+ {
643
+ "epoch": 0.20714905096830138,
644
+ "grad_norm": 0.32883742451667786,
645
+ "learning_rate": 0.00019917178918971,
646
+ "loss": 0.2163,
647
+ "step": 2150
648
+ },
649
+ {
650
+ "epoch": 0.20714905096830138,
651
+ "eval_loss": 0.20846392214298248,
652
+ "eval_runtime": 86.2766,
653
+ "eval_samples_per_second": 220.268,
654
+ "eval_steps_per_second": 4.59,
655
+ "step": 2150
656
+ },
657
+ {
658
+ "epoch": 0.2119664707582619,
659
+ "grad_norm": 0.3935041129589081,
660
+ "learning_rate": 0.00019915251951055016,
661
+ "loss": 0.216,
662
+ "step": 2200
663
+ },
664
+ {
665
+ "epoch": 0.2119664707582619,
666
+ "eval_loss": 0.206617072224617,
667
+ "eval_runtime": 87.55,
668
+ "eval_samples_per_second": 217.065,
669
+ "eval_steps_per_second": 4.523,
670
+ "step": 2200
671
+ },
672
+ {
673
+ "epoch": 0.21678389054822236,
674
+ "grad_norm": 0.41742655634880066,
675
+ "learning_rate": 0.00019913324983139032,
676
+ "loss": 0.2046,
677
+ "step": 2250
678
+ },
679
+ {
680
+ "epoch": 0.21678389054822236,
681
+ "eval_loss": 0.2045588344335556,
682
+ "eval_runtime": 86.8857,
683
+ "eval_samples_per_second": 218.724,
684
+ "eval_steps_per_second": 4.558,
685
+ "step": 2250
686
+ },
687
+ {
688
+ "epoch": 0.22160131033818287,
689
+ "grad_norm": 0.3324650526046753,
690
+ "learning_rate": 0.00019911398015223049,
691
+ "loss": 0.206,
692
+ "step": 2300
693
+ },
694
+ {
695
+ "epoch": 0.22160131033818287,
696
+ "eval_loss": 0.2034013867378235,
697
+ "eval_runtime": 88.0292,
698
+ "eval_samples_per_second": 215.883,
699
+ "eval_steps_per_second": 4.499,
700
+ "step": 2300
701
+ },
702
+ {
703
+ "epoch": 0.22641873012814337,
704
+ "grad_norm": 0.3411875367164612,
705
+ "learning_rate": 0.00019909471047307062,
706
+ "loss": 0.2021,
707
+ "step": 2350
708
+ },
709
+ {
710
+ "epoch": 0.22641873012814337,
711
+ "eval_loss": 0.19746768474578857,
712
+ "eval_runtime": 88.0101,
713
+ "eval_samples_per_second": 215.93,
714
+ "eval_steps_per_second": 4.499,
715
+ "step": 2350
716
+ },
717
+ {
718
+ "epoch": 0.23123614991810387,
719
+ "grad_norm": 0.3764231503009796,
720
+ "learning_rate": 0.00019907544079391079,
721
+ "loss": 0.2028,
722
+ "step": 2400
723
+ },
724
+ {
725
+ "epoch": 0.23123614991810387,
726
+ "eval_loss": 0.19666016101837158,
727
+ "eval_runtime": 87.9384,
728
+ "eval_samples_per_second": 216.106,
729
+ "eval_steps_per_second": 4.503,
730
+ "step": 2400
731
+ },
732
+ {
733
+ "epoch": 0.23605356970806435,
734
+ "grad_norm": 0.377541720867157,
735
+ "learning_rate": 0.00019905617111475095,
736
+ "loss": 0.1995,
737
+ "step": 2450
738
+ },
739
+ {
740
+ "epoch": 0.23605356970806435,
741
+ "eval_loss": 0.19164888560771942,
742
+ "eval_runtime": 88.6219,
743
+ "eval_samples_per_second": 214.439,
744
+ "eval_steps_per_second": 4.468,
745
+ "step": 2450
746
+ },
747
+ {
748
+ "epoch": 0.24087098949802485,
749
+ "grad_norm": 0.38512399792671204,
750
+ "learning_rate": 0.0001990369014355911,
751
+ "loss": 0.1947,
752
+ "step": 2500
753
+ },
754
+ {
755
+ "epoch": 0.24087098949802485,
756
+ "eval_loss": 0.1909172534942627,
757
+ "eval_runtime": 86.7278,
758
+ "eval_samples_per_second": 219.122,
759
+ "eval_steps_per_second": 4.566,
760
+ "step": 2500
761
+ },
762
+ {
763
+ "epoch": 0.24568840928798535,
764
+ "grad_norm": 0.29089486598968506,
765
+ "learning_rate": 0.00019901763175643128,
766
+ "loss": 0.1934,
767
+ "step": 2550
768
+ },
769
+ {
770
+ "epoch": 0.24568840928798535,
771
+ "eval_loss": 0.19215822219848633,
772
+ "eval_runtime": 87.4982,
773
+ "eval_samples_per_second": 217.193,
774
+ "eval_steps_per_second": 4.526,
775
+ "step": 2550
776
+ },
777
+ {
778
+ "epoch": 0.25050582907794583,
779
+ "grad_norm": 0.34340256452560425,
780
+ "learning_rate": 0.00019899836207727144,
781
+ "loss": 0.1893,
782
+ "step": 2600
783
+ },
784
+ {
785
+ "epoch": 0.25050582907794583,
786
+ "eval_loss": 0.18751804530620575,
787
+ "eval_runtime": 89.0572,
788
+ "eval_samples_per_second": 213.391,
789
+ "eval_steps_per_second": 4.447,
790
+ "step": 2600
791
+ },
792
+ {
793
+ "epoch": 0.25532324886790636,
794
+ "grad_norm": 0.31367331743240356,
795
+ "learning_rate": 0.00019897909239811158,
796
+ "loss": 0.1831,
797
+ "step": 2650
798
+ },
799
+ {
800
+ "epoch": 0.25532324886790636,
801
+ "eval_loss": 0.1869846135377884,
802
+ "eval_runtime": 88.3755,
803
+ "eval_samples_per_second": 215.037,
804
+ "eval_steps_per_second": 4.481,
805
+ "step": 2650
806
+ },
807
+ {
808
+ "epoch": 0.26014066865786684,
809
+ "grad_norm": 0.35607218742370605,
810
+ "learning_rate": 0.00019895982271895174,
811
+ "loss": 0.1906,
812
+ "step": 2700
813
+ },
814
+ {
815
+ "epoch": 0.26014066865786684,
816
+ "eval_loss": 0.1840076744556427,
817
+ "eval_runtime": 88.0613,
818
+ "eval_samples_per_second": 215.804,
819
+ "eval_steps_per_second": 4.497,
820
+ "step": 2700
821
+ },
822
+ {
823
+ "epoch": 0.26495808844782737,
824
+ "grad_norm": 0.29454657435417175,
825
+ "learning_rate": 0.00019894055303979188,
826
+ "loss": 0.1878,
827
+ "step": 2750
828
+ },
829
+ {
830
+ "epoch": 0.26495808844782737,
831
+ "eval_loss": 0.1864839792251587,
832
+ "eval_runtime": 87.4663,
833
+ "eval_samples_per_second": 217.272,
834
+ "eval_steps_per_second": 4.527,
835
+ "step": 2750
836
+ },
837
+ {
838
+ "epoch": 0.26977550823778784,
839
+ "grad_norm": 0.33838963508605957,
840
+ "learning_rate": 0.00019892128336063204,
841
+ "loss": 0.1821,
842
+ "step": 2800
843
+ },
844
+ {
845
+ "epoch": 0.26977550823778784,
846
+ "eval_loss": 0.1847812533378601,
847
+ "eval_runtime": 88.0891,
848
+ "eval_samples_per_second": 215.736,
849
+ "eval_steps_per_second": 4.495,
850
+ "step": 2800
851
+ },
852
+ {
853
+ "epoch": 0.2745929280277483,
854
+ "grad_norm": 0.2440441995859146,
855
+ "learning_rate": 0.0001989020136814722,
856
+ "loss": 0.1783,
857
+ "step": 2850
858
+ },
859
+ {
860
+ "epoch": 0.2745929280277483,
861
+ "eval_loss": 0.18052709102630615,
862
+ "eval_runtime": 85.9682,
863
+ "eval_samples_per_second": 221.059,
864
+ "eval_steps_per_second": 4.606,
865
+ "step": 2850
866
+ },
867
+ {
868
+ "epoch": 0.27941034781770885,
869
+ "grad_norm": 0.33384960889816284,
870
+ "learning_rate": 0.00019888274400231237,
871
+ "loss": 0.1796,
872
+ "step": 2900
873
+ },
874
+ {
875
+ "epoch": 0.27941034781770885,
876
+ "eval_loss": 0.17909078299999237,
877
+ "eval_runtime": 87.5176,
878
+ "eval_samples_per_second": 217.145,
879
+ "eval_steps_per_second": 4.525,
880
+ "step": 2900
881
+ },
882
+ {
883
+ "epoch": 0.2842277676076693,
884
+ "grad_norm": 0.29702430963516235,
885
+ "learning_rate": 0.00019886347432315253,
886
+ "loss": 0.1785,
887
+ "step": 2950
888
+ },
889
+ {
890
+ "epoch": 0.2842277676076693,
891
+ "eval_loss": 0.17997683584690094,
892
+ "eval_runtime": 87.9757,
893
+ "eval_samples_per_second": 216.014,
894
+ "eval_steps_per_second": 4.501,
895
+ "step": 2950
896
+ },
897
+ {
898
+ "epoch": 0.28904518739762985,
899
+ "grad_norm": 0.3510948121547699,
900
+ "learning_rate": 0.0001988442046439927,
901
+ "loss": 0.1765,
902
+ "step": 3000
903
+ },
904
+ {
905
+ "epoch": 0.28904518739762985,
906
+ "eval_loss": 0.1761734038591385,
907
+ "eval_runtime": 88.1258,
908
+ "eval_samples_per_second": 215.646,
909
+ "eval_steps_per_second": 4.494,
910
+ "step": 3000
911
+ },
912
+ {
913
+ "epoch": 0.29386260718759033,
914
+ "grad_norm": 0.37029772996902466,
915
+ "learning_rate": 0.00019882493496483286,
916
+ "loss": 0.1737,
917
+ "step": 3050
918
+ },
919
+ {
920
+ "epoch": 0.29386260718759033,
921
+ "eval_loss": 0.17449568212032318,
922
+ "eval_runtime": 88.3713,
923
+ "eval_samples_per_second": 215.047,
924
+ "eval_steps_per_second": 4.481,
925
+ "step": 3050
926
+ },
927
+ {
928
+ "epoch": 0.2986800269775508,
929
+ "grad_norm": 0.2771267592906952,
930
+ "learning_rate": 0.000198805665285673,
931
+ "loss": 0.1761,
932
+ "step": 3100
933
+ },
934
+ {
935
+ "epoch": 0.2986800269775508,
936
+ "eval_loss": 0.1765695959329605,
937
+ "eval_runtime": 87.9476,
938
+ "eval_samples_per_second": 216.083,
939
+ "eval_steps_per_second": 4.503,
940
+ "step": 3100
941
+ },
942
+ {
943
+ "epoch": 0.30349744676751134,
944
+ "grad_norm": 0.338558167219162,
945
+ "learning_rate": 0.00019878639560651316,
946
+ "loss": 0.1766,
947
+ "step": 3150
948
+ },
949
+ {
950
+ "epoch": 0.30349744676751134,
951
+ "eval_loss": 0.17348217964172363,
952
+ "eval_runtime": 86.4764,
953
+ "eval_samples_per_second": 219.759,
954
+ "eval_steps_per_second": 4.579,
955
+ "step": 3150
956
+ },
957
+ {
958
+ "epoch": 0.3083148665574718,
959
+ "grad_norm": 0.36884650588035583,
960
+ "learning_rate": 0.00019876712592735332,
961
+ "loss": 0.1697,
962
+ "step": 3200
963
+ },
964
+ {
965
+ "epoch": 0.3083148665574718,
966
+ "eval_loss": 0.1715058535337448,
967
+ "eval_runtime": 88.0269,
968
+ "eval_samples_per_second": 215.888,
969
+ "eval_steps_per_second": 4.499,
970
+ "step": 3200
971
+ },
972
+ {
973
+ "epoch": 0.31313228634743234,
974
+ "grad_norm": 0.2907465994358063,
975
+ "learning_rate": 0.00019874785624819348,
976
+ "loss": 0.1747,
977
+ "step": 3250
978
+ },
979
+ {
980
+ "epoch": 0.31313228634743234,
981
+ "eval_loss": 0.17374737560749054,
982
+ "eval_runtime": 87.3911,
983
+ "eval_samples_per_second": 217.459,
984
+ "eval_steps_per_second": 4.531,
985
+ "step": 3250
986
+ },
987
+ {
988
+ "epoch": 0.3179497061373928,
989
+ "grad_norm": 0.3033406734466553,
990
+ "learning_rate": 0.00019872858656903365,
991
+ "loss": 0.1706,
992
+ "step": 3300
993
+ },
994
+ {
995
+ "epoch": 0.3179497061373928,
996
+ "eval_loss": 0.17150533199310303,
997
+ "eval_runtime": 88.1112,
998
+ "eval_samples_per_second": 215.682,
999
+ "eval_steps_per_second": 4.494,
1000
+ "step": 3300
1001
+ },
1002
+ {
1003
+ "epoch": 0.3227671259273533,
1004
+ "grad_norm": 0.3115890324115753,
1005
+ "learning_rate": 0.0001987093168898738,
1006
+ "loss": 0.1665,
1007
+ "step": 3350
1008
+ },
1009
+ {
1010
+ "epoch": 0.3227671259273533,
1011
+ "eval_loss": 0.16980254650115967,
1012
+ "eval_runtime": 87.4444,
1013
+ "eval_samples_per_second": 217.327,
1014
+ "eval_steps_per_second": 4.529,
1015
+ "step": 3350
1016
+ },
1017
+ {
1018
+ "epoch": 0.3275845457173138,
1019
+ "grad_norm": 0.25616976618766785,
1020
+ "learning_rate": 0.00019869004721071395,
1021
+ "loss": 0.1655,
1022
+ "step": 3400
1023
+ },
1024
+ {
1025
+ "epoch": 0.3275845457173138,
1026
+ "eval_loss": 0.17015036940574646,
1027
+ "eval_runtime": 88.3575,
1028
+ "eval_samples_per_second": 215.081,
1029
+ "eval_steps_per_second": 4.482,
1030
+ "step": 3400
1031
+ },
1032
+ {
1033
+ "epoch": 0.3324019655072743,
1034
+ "grad_norm": 0.23346827924251556,
1035
+ "learning_rate": 0.0001986707775315541,
1036
+ "loss": 0.1673,
1037
+ "step": 3450
1038
+ },
1039
+ {
1040
+ "epoch": 0.3324019655072743,
1041
+ "eval_loss": 0.1687326282262802,
1042
+ "eval_runtime": 90.3569,
1043
+ "eval_samples_per_second": 210.321,
1044
+ "eval_steps_per_second": 4.383,
1045
+ "step": 3450
1046
+ },
1047
+ {
1048
+ "epoch": 0.3372193852972348,
1049
+ "grad_norm": 0.3347044289112091,
1050
+ "learning_rate": 0.00019865150785239425,
1051
+ "loss": 0.163,
1052
+ "step": 3500
1053
+ },
1054
+ {
1055
+ "epoch": 0.3372193852972348,
1056
+ "eval_loss": 0.17024628818035126,
1057
+ "eval_runtime": 88.9183,
1058
+ "eval_samples_per_second": 213.724,
1059
+ "eval_steps_per_second": 4.454,
1060
+ "step": 3500
1061
+ },
1062
+ {
1063
+ "epoch": 0.3420368050871953,
1064
+ "grad_norm": 0.34929919242858887,
1065
+ "learning_rate": 0.0001986322381732344,
1066
+ "loss": 0.1624,
1067
+ "step": 3550
1068
+ },
1069
+ {
1070
+ "epoch": 0.3420368050871953,
1071
+ "eval_loss": 0.16716831922531128,
1072
+ "eval_runtime": 89.8228,
1073
+ "eval_samples_per_second": 211.572,
1074
+ "eval_steps_per_second": 4.409,
1075
+ "step": 3550
1076
+ },
1077
+ {
1078
+ "epoch": 0.3468542248771558,
1079
+ "grad_norm": 0.2455097734928131,
1080
+ "learning_rate": 0.00019861296849407457,
1081
+ "loss": 0.1633,
1082
+ "step": 3600
1083
+ },
1084
+ {
1085
+ "epoch": 0.3468542248771558,
1086
+ "eval_loss": 0.16651391983032227,
1087
+ "eval_runtime": 86.8189,
1088
+ "eval_samples_per_second": 218.892,
1089
+ "eval_steps_per_second": 4.561,
1090
+ "step": 3600
1091
+ },
1092
+ {
1093
+ "epoch": 0.3516716446671163,
1094
+ "grad_norm": 0.28978198766708374,
1095
+ "learning_rate": 0.00019859369881491474,
1096
+ "loss": 0.1646,
1097
+ "step": 3650
1098
+ },
1099
+ {
1100
+ "epoch": 0.3516716446671163,
1101
+ "eval_loss": 0.16727794706821442,
1102
+ "eval_runtime": 87.7801,
1103
+ "eval_samples_per_second": 216.496,
1104
+ "eval_steps_per_second": 4.511,
1105
+ "step": 3650
1106
+ },
1107
+ {
1108
+ "epoch": 0.3564890644570768,
1109
+ "grad_norm": 0.292510062456131,
1110
+ "learning_rate": 0.0001985744291357549,
1111
+ "loss": 0.1625,
1112
+ "step": 3700
1113
+ },
1114
+ {
1115
+ "epoch": 0.3564890644570768,
1116
+ "eval_loss": 0.16462790966033936,
1117
+ "eval_runtime": 87.2466,
1118
+ "eval_samples_per_second": 217.819,
1119
+ "eval_steps_per_second": 4.539,
1120
+ "step": 3700
1121
+ },
1122
+ {
1123
+ "epoch": 0.36130648424703726,
1124
+ "grad_norm": 0.26186755299568176,
1125
+ "learning_rate": 0.00019855515945659506,
1126
+ "loss": 0.1583,
1127
+ "step": 3750
1128
+ },
1129
+ {
1130
+ "epoch": 0.36130648424703726,
1131
+ "eval_loss": 0.16536560654640198,
1132
+ "eval_runtime": 85.6294,
1133
+ "eval_samples_per_second": 221.933,
1134
+ "eval_steps_per_second": 4.625,
1135
+ "step": 3750
1136
+ },
1137
+ {
1138
+ "epoch": 0.3661239040369978,
1139
+ "grad_norm": 0.25864389538764954,
1140
+ "learning_rate": 0.00019853588977743523,
1141
+ "loss": 0.163,
1142
+ "step": 3800
1143
+ },
1144
+ {
1145
+ "epoch": 0.3661239040369978,
1146
+ "eval_loss": 0.16156961023807526,
1147
+ "eval_runtime": 88.8605,
1148
+ "eval_samples_per_second": 213.863,
1149
+ "eval_steps_per_second": 4.456,
1150
+ "step": 3800
1151
+ },
1152
+ {
1153
+ "epoch": 0.37094132382695827,
1154
+ "grad_norm": 0.298779159784317,
1155
+ "learning_rate": 0.00019851662009827536,
1156
+ "loss": 0.1582,
1157
+ "step": 3850
1158
+ },
1159
+ {
1160
+ "epoch": 0.37094132382695827,
1161
+ "eval_loss": 0.16503678262233734,
1162
+ "eval_runtime": 90.0855,
1163
+ "eval_samples_per_second": 210.955,
1164
+ "eval_steps_per_second": 4.396,
1165
+ "step": 3850
1166
+ },
1167
+ {
1168
+ "epoch": 0.3757587436169188,
1169
+ "grad_norm": 0.30580171942710876,
1170
+ "learning_rate": 0.00019849735041911553,
1171
+ "loss": 0.1568,
1172
+ "step": 3900
1173
+ },
1174
+ {
1175
+ "epoch": 0.3757587436169188,
1176
+ "eval_loss": 0.16093286871910095,
1177
+ "eval_runtime": 87.9051,
1178
+ "eval_samples_per_second": 216.188,
1179
+ "eval_steps_per_second": 4.505,
1180
+ "step": 3900
1181
+ },
1182
+ {
1183
+ "epoch": 0.3805761634068793,
1184
+ "grad_norm": 0.26903071999549866,
1185
+ "learning_rate": 0.0001984780807399557,
1186
+ "loss": 0.156,
1187
+ "step": 3950
1188
+ },
1189
+ {
1190
+ "epoch": 0.3805761634068793,
1191
+ "eval_loss": 0.16180996596813202,
1192
+ "eval_runtime": 87.2728,
1193
+ "eval_samples_per_second": 217.754,
1194
+ "eval_steps_per_second": 4.537,
1195
+ "step": 3950
1196
+ },
1197
+ {
1198
+ "epoch": 0.38539358319683975,
1199
+ "grad_norm": 0.25209519267082214,
1200
+ "learning_rate": 0.00019845881106079585,
1201
+ "loss": 0.1559,
1202
+ "step": 4000
1203
+ },
1204
+ {
1205
+ "epoch": 0.38539358319683975,
1206
+ "eval_loss": 0.15961618721485138,
1207
+ "eval_runtime": 89.2611,
1208
+ "eval_samples_per_second": 212.903,
1209
+ "eval_steps_per_second": 4.436,
1210
+ "step": 4000
1211
+ },
1212
+ {
1213
+ "epoch": 0.3902110029868003,
1214
+ "grad_norm": 0.29754438996315,
1215
+ "learning_rate": 0.00019843954138163602,
1216
+ "loss": 0.1546,
1217
+ "step": 4050
1218
+ },
1219
+ {
1220
+ "epoch": 0.3902110029868003,
1221
+ "eval_loss": 0.16072513163089752,
1222
+ "eval_runtime": 87.9158,
1223
+ "eval_samples_per_second": 216.161,
1224
+ "eval_steps_per_second": 4.504,
1225
+ "step": 4050
1226
+ },
1227
+ {
1228
+ "epoch": 0.39502842277676076,
1229
+ "grad_norm": 0.259056031703949,
1230
+ "learning_rate": 0.00019842027170247618,
1231
+ "loss": 0.1576,
1232
+ "step": 4100
1233
+ },
1234
+ {
1235
+ "epoch": 0.39502842277676076,
1236
+ "eval_loss": 0.16210326552391052,
1237
+ "eval_runtime": 87.5661,
1238
+ "eval_samples_per_second": 217.025,
1239
+ "eval_steps_per_second": 4.522,
1240
+ "step": 4100
1241
+ },
1242
+ {
1243
+ "epoch": 0.3998458425667213,
1244
+ "grad_norm": 0.2900582253932953,
1245
+ "learning_rate": 0.00019840100202331634,
1246
+ "loss": 0.1557,
1247
+ "step": 4150
1248
+ },
1249
+ {
1250
+ "epoch": 0.3998458425667213,
1251
+ "eval_loss": 0.16127543151378632,
1252
+ "eval_runtime": 87.2874,
1253
+ "eval_samples_per_second": 217.718,
1254
+ "eval_steps_per_second": 4.537,
1255
+ "step": 4150
1256
+ },
1257
+ {
1258
+ "epoch": 0.40466326235668176,
1259
+ "grad_norm": 0.3155025243759155,
1260
+ "learning_rate": 0.00019838173234415648,
1261
+ "loss": 0.151,
1262
+ "step": 4200
1263
+ },
1264
+ {
1265
+ "epoch": 0.40466326235668176,
1266
+ "eval_loss": 0.15659989416599274,
1267
+ "eval_runtime": 87.5007,
1268
+ "eval_samples_per_second": 217.187,
1269
+ "eval_steps_per_second": 4.526,
1270
+ "step": 4200
1271
+ },
1272
+ {
1273
+ "epoch": 0.40948068214664224,
1274
+ "grad_norm": 0.20887872576713562,
1275
+ "learning_rate": 0.00019836246266499664,
1276
+ "loss": 0.149,
1277
+ "step": 4250
1278
+ },
1279
+ {
1280
+ "epoch": 0.40948068214664224,
1281
+ "eval_loss": 0.1579655408859253,
1282
+ "eval_runtime": 88.1039,
1283
+ "eval_samples_per_second": 215.7,
1284
+ "eval_steps_per_second": 4.495,
1285
+ "step": 4250
1286
+ },
1287
+ {
1288
+ "epoch": 0.41429810193660277,
1289
+ "grad_norm": 0.2782476544380188,
1290
+ "learning_rate": 0.00019834319298583678,
1291
+ "loss": 0.1519,
1292
+ "step": 4300
1293
+ },
1294
+ {
1295
+ "epoch": 0.41429810193660277,
1296
+ "eval_loss": 0.15905822813510895,
1297
+ "eval_runtime": 89.4392,
1298
+ "eval_samples_per_second": 212.479,
1299
+ "eval_steps_per_second": 4.428,
1300
+ "step": 4300
1301
+ },
1302
+ {
1303
+ "epoch": 0.41911552172656324,
1304
+ "grad_norm": 0.2640698254108429,
1305
+ "learning_rate": 0.00019832392330667694,
1306
+ "loss": 0.1499,
1307
+ "step": 4350
1308
+ },
1309
+ {
1310
+ "epoch": 0.41911552172656324,
1311
+ "eval_loss": 0.15701065957546234,
1312
+ "eval_runtime": 88.5724,
1313
+ "eval_samples_per_second": 214.559,
1314
+ "eval_steps_per_second": 4.471,
1315
+ "step": 4350
1316
+ },
1317
+ {
1318
+ "epoch": 0.4239329415165238,
1319
+ "grad_norm": 0.2606890797615051,
1320
+ "learning_rate": 0.0001983046536275171,
1321
+ "loss": 0.1509,
1322
+ "step": 4400
1323
+ },
1324
+ {
1325
+ "epoch": 0.4239329415165238,
1326
+ "eval_loss": 0.1540563404560089,
1327
+ "eval_runtime": 87.9838,
1328
+ "eval_samples_per_second": 215.994,
1329
+ "eval_steps_per_second": 4.501,
1330
+ "step": 4400
1331
+ },
1332
+ {
1333
+ "epoch": 0.42875036130648425,
1334
+ "grad_norm": 0.32414618134498596,
1335
+ "learning_rate": 0.00019828538394835727,
1336
+ "loss": 0.1476,
1337
+ "step": 4450
1338
+ },
1339
+ {
1340
+ "epoch": 0.42875036130648425,
1341
+ "eval_loss": 0.15688583254814148,
1342
+ "eval_runtime": 85.8129,
1343
+ "eval_samples_per_second": 221.459,
1344
+ "eval_steps_per_second": 4.615,
1345
+ "step": 4450
1346
+ },
1347
+ {
1348
+ "epoch": 0.4335677810964447,
1349
+ "grad_norm": 0.31708353757858276,
1350
+ "learning_rate": 0.00019826611426919743,
1351
+ "loss": 0.1477,
1352
+ "step": 4500
1353
+ },
1354
+ {
1355
+ "epoch": 0.4335677810964447,
1356
+ "eval_loss": 0.15801727771759033,
1357
+ "eval_runtime": 89.5976,
1358
+ "eval_samples_per_second": 212.104,
1359
+ "eval_steps_per_second": 4.42,
1360
+ "step": 4500
1361
+ },
1362
+ {
1363
+ "epoch": 0.43838520088640526,
1364
+ "grad_norm": 0.23212596774101257,
1365
+ "learning_rate": 0.0001982468445900376,
1366
+ "loss": 0.146,
1367
+ "step": 4550
1368
+ },
1369
+ {
1370
+ "epoch": 0.43838520088640526,
1371
+ "eval_loss": 0.15200358629226685,
1372
+ "eval_runtime": 87.3248,
1373
+ "eval_samples_per_second": 217.624,
1374
+ "eval_steps_per_second": 4.535,
1375
+ "step": 4550
1376
+ },
1377
+ {
1378
+ "epoch": 0.44320262067636573,
1379
+ "grad_norm": 0.2829053997993469,
1380
+ "learning_rate": 0.00019822757491087773,
1381
+ "loss": 0.1465,
1382
+ "step": 4600
1383
+ },
1384
+ {
1385
+ "epoch": 0.44320262067636573,
1386
+ "eval_loss": 0.1542133092880249,
1387
+ "eval_runtime": 85.3408,
1388
+ "eval_samples_per_second": 222.684,
1389
+ "eval_steps_per_second": 4.64,
1390
+ "step": 4600
1391
+ },
1392
+ {
1393
+ "epoch": 0.4480200404663262,
1394
+ "grad_norm": 0.26959696412086487,
1395
+ "learning_rate": 0.0001982083052317179,
1396
+ "loss": 0.1494,
1397
+ "step": 4650
1398
+ },
1399
+ {
1400
+ "epoch": 0.4480200404663262,
1401
+ "eval_loss": 0.15412269532680511,
1402
+ "eval_runtime": 87.9426,
1403
+ "eval_samples_per_second": 216.095,
1404
+ "eval_steps_per_second": 4.503,
1405
+ "step": 4650
1406
+ },
1407
+ {
1408
+ "epoch": 0.45283746025628674,
1409
+ "grad_norm": 0.2577558755874634,
1410
+ "learning_rate": 0.00019818903555255806,
1411
+ "loss": 0.1465,
1412
+ "step": 4700
1413
+ },
1414
+ {
1415
+ "epoch": 0.45283746025628674,
1416
+ "eval_loss": 0.15427736937999725,
1417
+ "eval_runtime": 88.297,
1418
+ "eval_samples_per_second": 215.228,
1419
+ "eval_steps_per_second": 4.485,
1420
+ "step": 4700
1421
+ },
1422
+ {
1423
+ "epoch": 0.4576548800462472,
1424
+ "grad_norm": 0.23728205263614655,
1425
+ "learning_rate": 0.00019816976587339822,
1426
+ "loss": 0.1425,
1427
+ "step": 4750
1428
+ },
1429
+ {
1430
+ "epoch": 0.4576548800462472,
1431
+ "eval_loss": 0.15123674273490906,
1432
+ "eval_runtime": 87.7725,
1433
+ "eval_samples_per_second": 216.514,
1434
+ "eval_steps_per_second": 4.512,
1435
+ "step": 4750
1436
+ },
1437
+ {
1438
+ "epoch": 0.46247229983620775,
1439
+ "grad_norm": 0.2485925406217575,
1440
+ "learning_rate": 0.00019815049619423839,
1441
+ "loss": 0.1422,
1442
+ "step": 4800
1443
+ },
1444
+ {
1445
+ "epoch": 0.46247229983620775,
1446
+ "eval_loss": 0.15153329074382782,
1447
+ "eval_runtime": 87.8581,
1448
+ "eval_samples_per_second": 216.303,
1449
+ "eval_steps_per_second": 4.507,
1450
+ "step": 4800
1451
+ },
1452
+ {
1453
+ "epoch": 0.4672897196261682,
1454
+ "grad_norm": 0.22593127191066742,
1455
+ "learning_rate": 0.00019813122651507855,
1456
+ "loss": 0.1452,
1457
+ "step": 4850
1458
+ },
1459
+ {
1460
+ "epoch": 0.4672897196261682,
1461
+ "eval_loss": 0.15019385516643524,
1462
+ "eval_runtime": 88.2356,
1463
+ "eval_samples_per_second": 215.378,
1464
+ "eval_steps_per_second": 4.488,
1465
+ "step": 4850
1466
+ },
1467
+ {
1468
+ "epoch": 0.4721071394161287,
1469
+ "grad_norm": 0.2340419441461563,
1470
+ "learning_rate": 0.0001981119568359187,
1471
+ "loss": 0.1409,
1472
+ "step": 4900
1473
+ },
1474
+ {
1475
+ "epoch": 0.4721071394161287,
1476
+ "eval_loss": 0.14823836088180542,
1477
+ "eval_runtime": 87.0907,
1478
+ "eval_samples_per_second": 218.209,
1479
+ "eval_steps_per_second": 4.547,
1480
+ "step": 4900
1481
+ },
1482
+ {
1483
+ "epoch": 0.4769245592060892,
1484
+ "grad_norm": 0.24335741996765137,
1485
+ "learning_rate": 0.00019809268715675885,
1486
+ "loss": 0.1442,
1487
+ "step": 4950
1488
+ },
1489
+ {
1490
+ "epoch": 0.4769245592060892,
1491
+ "eval_loss": 0.14839066565036774,
1492
+ "eval_runtime": 88.1982,
1493
+ "eval_samples_per_second": 215.469,
1494
+ "eval_steps_per_second": 4.49,
1495
+ "step": 4950
1496
+ },
1497
+ {
1498
+ "epoch": 0.4817419789960497,
1499
+ "grad_norm": 0.2178025096654892,
1500
+ "learning_rate": 0.000198073417477599,
1501
+ "loss": 0.1397,
1502
+ "step": 5000
1503
+ },
1504
+ {
1505
+ "epoch": 0.4817419789960497,
1506
+ "eval_loss": 0.14846083521842957,
1507
+ "eval_runtime": 88.7021,
1508
+ "eval_samples_per_second": 214.245,
1509
+ "eval_steps_per_second": 4.464,
1510
+ "step": 5000
1511
+ },
1512
+ {
1513
+ "epoch": 0.48655939878601023,
1514
+ "grad_norm": 0.2502117156982422,
1515
+ "learning_rate": 0.00019805414779843915,
1516
+ "loss": 0.1412,
1517
+ "step": 5050
1518
+ },
1519
+ {
1520
+ "epoch": 0.48655939878601023,
1521
+ "eval_loss": 0.14794325828552246,
1522
+ "eval_runtime": 87.2101,
1523
+ "eval_samples_per_second": 217.911,
1524
+ "eval_steps_per_second": 4.541,
1525
+ "step": 5050
1526
+ },
1527
+ {
1528
+ "epoch": 0.4913768185759707,
1529
+ "grad_norm": 0.26042014360427856,
1530
+ "learning_rate": 0.0001980348781192793,
1531
+ "loss": 0.1407,
1532
+ "step": 5100
1533
+ },
1534
+ {
1535
+ "epoch": 0.4913768185759707,
1536
+ "eval_loss": 0.14844238758087158,
1537
+ "eval_runtime": 87.5921,
1538
+ "eval_samples_per_second": 216.96,
1539
+ "eval_steps_per_second": 4.521,
1540
+ "step": 5100
1541
+ },
1542
+ {
1543
+ "epoch": 0.4961942383659312,
1544
+ "grad_norm": 0.27736595273017883,
1545
+ "learning_rate": 0.00019801560844011948,
1546
+ "loss": 0.1392,
1547
+ "step": 5150
1548
+ },
1549
+ {
1550
+ "epoch": 0.4961942383659312,
1551
+ "eval_loss": 0.1466793715953827,
1552
+ "eval_runtime": 87.8863,
1553
+ "eval_samples_per_second": 216.234,
1554
+ "eval_steps_per_second": 4.506,
1555
+ "step": 5150
1556
+ },
1557
+ {
1558
+ "epoch": 0.5010116581558917,
1559
+ "grad_norm": 0.26440733671188354,
1560
+ "learning_rate": 0.00019799633876095964,
1561
+ "loss": 0.1362,
1562
+ "step": 5200
1563
+ },
1564
+ {
1565
+ "epoch": 0.5010116581558917,
1566
+ "eval_loss": 0.1452549546957016,
1567
+ "eval_runtime": 87.8213,
1568
+ "eval_samples_per_second": 216.394,
1569
+ "eval_steps_per_second": 4.509,
1570
+ "step": 5200
1571
+ },
1572
+ {
1573
+ "epoch": 0.5058290779458522,
1574
+ "grad_norm": 0.2524762749671936,
1575
+ "learning_rate": 0.0001979770690817998,
1576
+ "loss": 0.1411,
1577
+ "step": 5250
1578
+ },
1579
+ {
1580
+ "epoch": 0.5058290779458522,
1581
+ "eval_loss": 0.14888718724250793,
1582
+ "eval_runtime": 87.8648,
1583
+ "eval_samples_per_second": 216.287,
1584
+ "eval_steps_per_second": 4.507,
1585
+ "step": 5250
1586
+ },
1587
+ {
1588
+ "epoch": 0.5106464977358127,
1589
+ "grad_norm": 0.2830604612827301,
1590
+ "learning_rate": 0.00019795779940263997,
1591
+ "loss": 0.1373,
1592
+ "step": 5300
1593
+ },
1594
+ {
1595
+ "epoch": 0.5106464977358127,
1596
+ "eval_loss": 0.14802096784114838,
1597
+ "eval_runtime": 86.7846,
1598
+ "eval_samples_per_second": 218.979,
1599
+ "eval_steps_per_second": 4.563,
1600
+ "step": 5300
1601
+ },
1602
+ {
1603
+ "epoch": 0.5154639175257731,
1604
+ "grad_norm": 0.23933689296245575,
1605
+ "learning_rate": 0.0001979385297234801,
1606
+ "loss": 0.1356,
1607
+ "step": 5350
1608
+ },
1609
+ {
1610
+ "epoch": 0.5154639175257731,
1611
+ "eval_loss": 0.14858801662921906,
1612
+ "eval_runtime": 86.8659,
1613
+ "eval_samples_per_second": 218.774,
1614
+ "eval_steps_per_second": 4.559,
1615
+ "step": 5350
1616
+ },
1617
+ {
1618
+ "epoch": 0.5202813373157337,
1619
+ "grad_norm": 0.23482471704483032,
1620
+ "learning_rate": 0.00019791926004432027,
1621
+ "loss": 0.1375,
1622
+ "step": 5400
1623
+ },
1624
+ {
1625
+ "epoch": 0.5202813373157337,
1626
+ "eval_loss": 0.14439742267131805,
1627
+ "eval_runtime": 86.5083,
1628
+ "eval_samples_per_second": 219.678,
1629
+ "eval_steps_per_second": 4.578,
1630
+ "step": 5400
1631
+ },
1632
+ {
1633
+ "epoch": 0.5250987571056942,
1634
+ "grad_norm": 0.2511419653892517,
1635
+ "learning_rate": 0.00019789999036516043,
1636
+ "loss": 0.1338,
1637
+ "step": 5450
1638
+ },
1639
+ {
1640
+ "epoch": 0.5250987571056942,
1641
+ "eval_loss": 0.14546434581279755,
1642
+ "eval_runtime": 88.2612,
1643
+ "eval_samples_per_second": 215.316,
1644
+ "eval_steps_per_second": 4.487,
1645
+ "step": 5450
1646
+ },
1647
+ {
1648
+ "epoch": 0.5299161768956547,
1649
+ "grad_norm": 0.2835318148136139,
1650
+ "learning_rate": 0.0001978807206860006,
1651
+ "loss": 0.1373,
1652
+ "step": 5500
1653
+ },
1654
+ {
1655
+ "epoch": 0.5299161768956547,
1656
+ "eval_loss": 0.14437447488307953,
1657
+ "eval_runtime": 87.9924,
1658
+ "eval_samples_per_second": 215.973,
1659
+ "eval_steps_per_second": 4.5,
1660
+ "step": 5500
1661
+ },
1662
+ {
1663
+ "epoch": 0.5347335966856152,
1664
+ "grad_norm": 0.2739470899105072,
1665
+ "learning_rate": 0.00019786145100684076,
1666
+ "loss": 0.1377,
1667
+ "step": 5550
1668
+ },
1669
+ {
1670
+ "epoch": 0.5347335966856152,
1671
+ "eval_loss": 0.14675094187259674,
1672
+ "eval_runtime": 88.4052,
1673
+ "eval_samples_per_second": 214.965,
1674
+ "eval_steps_per_second": 4.479,
1675
+ "step": 5550
1676
+ },
1677
+ {
1678
+ "epoch": 0.5395510164755757,
1679
+ "grad_norm": 0.23613734543323517,
1680
+ "learning_rate": 0.00019784218132768092,
1681
+ "loss": 0.1375,
1682
+ "step": 5600
1683
+ },
1684
+ {
1685
+ "epoch": 0.5395510164755757,
1686
+ "eval_loss": 0.14566753804683685,
1687
+ "eval_runtime": 87.4914,
1688
+ "eval_samples_per_second": 217.21,
1689
+ "eval_steps_per_second": 4.526,
1690
+ "step": 5600
1691
+ },
1692
+ {
1693
+ "epoch": 0.5443684362655362,
1694
+ "grad_norm": 0.2323722243309021,
1695
+ "learning_rate": 0.00019782291164852108,
1696
+ "loss": 0.1321,
1697
+ "step": 5650
1698
+ },
1699
+ {
1700
+ "epoch": 0.5443684362655362,
1701
+ "eval_loss": 0.1452152580022812,
1702
+ "eval_runtime": 89.1135,
1703
+ "eval_samples_per_second": 213.256,
1704
+ "eval_steps_per_second": 4.444,
1705
+ "step": 5650
1706
+ },
1707
+ {
1708
+ "epoch": 0.5491858560554966,
1709
+ "grad_norm": 0.29475587606430054,
1710
+ "learning_rate": 0.00019780364196936122,
1711
+ "loss": 0.1335,
1712
+ "step": 5700
1713
+ },
1714
+ {
1715
+ "epoch": 0.5491858560554966,
1716
+ "eval_loss": 0.142301544547081,
1717
+ "eval_runtime": 85.457,
1718
+ "eval_samples_per_second": 222.381,
1719
+ "eval_steps_per_second": 4.634,
1720
+ "step": 5700
1721
+ },
1722
+ {
1723
+ "epoch": 0.5540032758454572,
1724
+ "grad_norm": 0.29040685296058655,
1725
+ "learning_rate": 0.00019778437229020138,
1726
+ "loss": 0.1339,
1727
+ "step": 5750
1728
+ },
1729
+ {
1730
+ "epoch": 0.5540032758454572,
1731
+ "eval_loss": 0.14422039687633514,
1732
+ "eval_runtime": 88.9367,
1733
+ "eval_samples_per_second": 213.68,
1734
+ "eval_steps_per_second": 4.453,
1735
+ "step": 5750
1736
+ },
1737
+ {
1738
+ "epoch": 0.5588206956354177,
1739
+ "grad_norm": 0.28566643595695496,
1740
+ "learning_rate": 0.00019776510261104152,
1741
+ "loss": 0.1334,
1742
+ "step": 5800
1743
+ },
1744
+ {
1745
+ "epoch": 0.5588206956354177,
1746
+ "eval_loss": 0.14271041750907898,
1747
+ "eval_runtime": 87.7795,
1748
+ "eval_samples_per_second": 216.497,
1749
+ "eval_steps_per_second": 4.511,
1750
+ "step": 5800
1751
+ },
1752
+ {
1753
+ "epoch": 0.5636381154253781,
1754
+ "grad_norm": 0.2813100218772888,
1755
+ "learning_rate": 0.00019774583293188168,
1756
+ "loss": 0.1347,
1757
+ "step": 5850
1758
+ },
1759
+ {
1760
+ "epoch": 0.5636381154253781,
1761
+ "eval_loss": 0.14340683817863464,
1762
+ "eval_runtime": 87.6902,
1763
+ "eval_samples_per_second": 216.717,
1764
+ "eval_steps_per_second": 4.516,
1765
+ "step": 5850
1766
+ },
1767
+ {
1768
+ "epoch": 0.5684555352153386,
1769
+ "grad_norm": 0.2560277581214905,
1770
+ "learning_rate": 0.00019772656325272185,
1771
+ "loss": 0.1335,
1772
+ "step": 5900
1773
+ },
1774
+ {
1775
+ "epoch": 0.5684555352153386,
1776
+ "eval_loss": 0.1437900960445404,
1777
+ "eval_runtime": 88.7352,
1778
+ "eval_samples_per_second": 214.165,
1779
+ "eval_steps_per_second": 4.463,
1780
+ "step": 5900
1781
+ },
1782
+ {
1783
+ "epoch": 0.5732729550052992,
1784
+ "grad_norm": 0.2642715275287628,
1785
+ "learning_rate": 0.000197707293573562,
1786
+ "loss": 0.1313,
1787
+ "step": 5950
1788
+ },
1789
+ {
1790
+ "epoch": 0.5732729550052992,
1791
+ "eval_loss": 0.14153434336185455,
1792
+ "eval_runtime": 89.9067,
1793
+ "eval_samples_per_second": 211.375,
1794
+ "eval_steps_per_second": 4.405,
1795
+ "step": 5950
1796
+ },
1797
+ {
1798
+ "epoch": 0.5780903747952597,
1799
+ "grad_norm": 0.2073492407798767,
1800
+ "learning_rate": 0.00019768802389440217,
1801
+ "loss": 0.1323,
1802
+ "step": 6000
1803
+ },
1804
+ {
1805
+ "epoch": 0.5780903747952597,
1806
+ "eval_loss": 0.13940538465976715,
1807
+ "eval_runtime": 89.7245,
1808
+ "eval_samples_per_second": 211.804,
1809
+ "eval_steps_per_second": 4.414,
1810
+ "step": 6000
1811
+ }
1812
+ ],
1813
+ "logging_steps": 50,
1814
+ "max_steps": 518950,
1815
+ "num_input_tokens_seen": 0,
1816
+ "num_train_epochs": 50,
1817
+ "save_steps": 1000,
1818
+ "stateful_callbacks": {
1819
+ "EarlyStoppingCallback": {
1820
+ "args": {
1821
+ "early_stopping_patience": 10,
1822
+ "early_stopping_threshold": 0.001
1823
+ },
1824
+ "attributes": {
1825
+ "early_stopping_patience_counter": 0
1826
+ }
1827
+ },
1828
+ "TrainerControl": {
1829
+ "args": {
1830
+ "should_epoch_stop": false,
1831
+ "should_evaluate": false,
1832
+ "should_log": false,
1833
+ "should_save": true,
1834
+ "should_training_stop": false
1835
+ },
1836
+ "attributes": {}
1837
+ }
1838
+ },
1839
+ "total_flos": 1.21054265081856e+17,
1840
+ "train_batch_size": 192,
1841
+ "trial_name": null,
1842
+ "trial_params": null
1843
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a527f086366d366847d11f10c57e6cc777a7a700779f165accea7b1f13ea243
3
+ size 15761