Thecoder3281f's picture
Upload 9 files
4d86992 verified
{
"best_global_step": 199000,
"best_metric": 0.003153804922476411,
"best_model_checkpoint": "./models/t5-small-separated-augmented-200k\\checkpoint-199000",
"epoch": 1.9558180698031469,
"eval_steps": 1000,
"global_step": 200000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004889545174507868,
"grad_norm": 0.5060574412345886,
"learning_rate": 1.7465e-05,
"loss": 6.6771,
"step": 500
},
{
"epoch": 0.009779090349015735,
"grad_norm": 0.35827475786209106,
"learning_rate": 3.4965e-05,
"loss": 0.1893,
"step": 1000
},
{
"epoch": 0.009779090349015735,
"eval_accuracy": 0.9878454285714285,
"eval_loss": 0.09678807854652405,
"eval_runtime": 54.2373,
"eval_samples_per_second": 553.125,
"eval_steps_per_second": 34.57,
"step": 1000
},
{
"epoch": 0.014668635523523602,
"grad_norm": 0.34889769554138184,
"learning_rate": 5.2465e-05,
"loss": 0.1447,
"step": 1500
},
{
"epoch": 0.01955818069803147,
"grad_norm": 0.23819516599178314,
"learning_rate": 6.9965e-05,
"loss": 0.1206,
"step": 2000
},
{
"epoch": 0.01955818069803147,
"eval_accuracy": 0.9902656666666667,
"eval_loss": 0.073659747838974,
"eval_runtime": 53.4879,
"eval_samples_per_second": 560.875,
"eval_steps_per_second": 35.055,
"step": 2000
},
{
"epoch": 0.024447725872539336,
"grad_norm": 0.22346411645412445,
"learning_rate": 8.7465e-05,
"loss": 0.1088,
"step": 2500
},
{
"epoch": 0.029337271047047205,
"grad_norm": 0.1609542965888977,
"learning_rate": 0.000104965,
"loss": 0.1005,
"step": 3000
},
{
"epoch": 0.029337271047047205,
"eval_accuracy": 0.9918363809523809,
"eval_loss": 0.05878164619207382,
"eval_runtime": 55.0708,
"eval_samples_per_second": 544.753,
"eval_steps_per_second": 34.047,
"step": 3000
},
{
"epoch": 0.03422681622155507,
"grad_norm": 0.16877996921539307,
"learning_rate": 0.000122465,
"loss": 0.0921,
"step": 3500
},
{
"epoch": 0.03911636139606294,
"grad_norm": 0.19500480592250824,
"learning_rate": 0.00013996499999999998,
"loss": 0.0857,
"step": 4000
},
{
"epoch": 0.03911636139606294,
"eval_accuracy": 0.992877380952381,
"eval_loss": 0.05044129863381386,
"eval_runtime": 52.8632,
"eval_samples_per_second": 567.503,
"eval_steps_per_second": 35.469,
"step": 4000
},
{
"epoch": 0.0440059065705708,
"grad_norm": 0.2271735668182373,
"learning_rate": 0.000157465,
"loss": 0.0785,
"step": 4500
},
{
"epoch": 0.04889545174507867,
"grad_norm": 0.15599773824214935,
"learning_rate": 0.000174965,
"loss": 0.0743,
"step": 5000
},
{
"epoch": 0.04889545174507867,
"eval_accuracy": 0.993641380952381,
"eval_loss": 0.04416767507791519,
"eval_runtime": 53.9114,
"eval_samples_per_second": 556.469,
"eval_steps_per_second": 34.779,
"step": 5000
},
{
"epoch": 0.05378499691958654,
"grad_norm": 0.13649936020374298,
"learning_rate": 0.000192465,
"loss": 0.0696,
"step": 5500
},
{
"epoch": 0.05867454209409441,
"grad_norm": 0.17215129733085632,
"learning_rate": 0.000209965,
"loss": 0.0669,
"step": 6000
},
{
"epoch": 0.05867454209409441,
"eval_accuracy": 0.994305380952381,
"eval_loss": 0.03922554850578308,
"eval_runtime": 53.2344,
"eval_samples_per_second": 563.546,
"eval_steps_per_second": 35.222,
"step": 6000
},
{
"epoch": 0.06356408726860227,
"grad_norm": 0.21248804032802582,
"learning_rate": 0.00022746500000000002,
"loss": 0.0636,
"step": 6500
},
{
"epoch": 0.06845363244311015,
"grad_norm": 0.2209671139717102,
"learning_rate": 0.000244965,
"loss": 0.062,
"step": 7000
},
{
"epoch": 0.06845363244311015,
"eval_accuracy": 0.9948234285714286,
"eval_loss": 0.035148605704307556,
"eval_runtime": 53.6983,
"eval_samples_per_second": 558.677,
"eval_steps_per_second": 34.917,
"step": 7000
},
{
"epoch": 0.07334317761761801,
"grad_norm": 0.16804896295070648,
"learning_rate": 0.000262465,
"loss": 0.0584,
"step": 7500
},
{
"epoch": 0.07823272279212588,
"grad_norm": 0.13331238925457,
"learning_rate": 0.000279965,
"loss": 0.0576,
"step": 8000
},
{
"epoch": 0.07823272279212588,
"eval_accuracy": 0.995110619047619,
"eval_loss": 0.03278239071369171,
"eval_runtime": 52.8459,
"eval_samples_per_second": 567.688,
"eval_steps_per_second": 35.481,
"step": 8000
},
{
"epoch": 0.08312226796663374,
"grad_norm": 0.15275965631008148,
"learning_rate": 0.000297465,
"loss": 0.0545,
"step": 8500
},
{
"epoch": 0.0880118131411416,
"grad_norm": 0.14770014584064484,
"learning_rate": 0.000314965,
"loss": 0.0509,
"step": 9000
},
{
"epoch": 0.0880118131411416,
"eval_accuracy": 0.9955557619047619,
"eval_loss": 0.029773302376270294,
"eval_runtime": 53.681,
"eval_samples_per_second": 558.857,
"eval_steps_per_second": 34.929,
"step": 9000
},
{
"epoch": 0.09290135831564948,
"grad_norm": 0.13802163302898407,
"learning_rate": 0.000332465,
"loss": 0.0503,
"step": 9500
},
{
"epoch": 0.09779090349015734,
"grad_norm": 0.16345028579235077,
"learning_rate": 0.000349965,
"loss": 0.0492,
"step": 10000
},
{
"epoch": 0.09779090349015734,
"eval_accuracy": 0.9958419047619048,
"eval_loss": 0.027848461642861366,
"eval_runtime": 53.5151,
"eval_samples_per_second": 560.589,
"eval_steps_per_second": 35.037,
"step": 10000
},
{
"epoch": 0.10268044866466522,
"grad_norm": 0.09112809598445892,
"learning_rate": 0.00036746500000000003,
"loss": 0.0475,
"step": 10500
},
{
"epoch": 0.10756999383917308,
"grad_norm": 0.20798154175281525,
"learning_rate": 0.000384965,
"loss": 0.046,
"step": 11000
},
{
"epoch": 0.10756999383917308,
"eval_accuracy": 0.99604,
"eval_loss": 0.026027251034975052,
"eval_runtime": 53.2138,
"eval_samples_per_second": 563.764,
"eval_steps_per_second": 35.235,
"step": 11000
},
{
"epoch": 0.11245953901368094,
"grad_norm": 0.19015829265117645,
"learning_rate": 0.00040246499999999996,
"loss": 0.0432,
"step": 11500
},
{
"epoch": 0.11734908418818882,
"grad_norm": 0.12272685021162033,
"learning_rate": 0.000419965,
"loss": 0.0434,
"step": 12000
},
{
"epoch": 0.11734908418818882,
"eval_accuracy": 0.9963316666666666,
"eval_loss": 0.024126138538122177,
"eval_runtime": 53.0184,
"eval_samples_per_second": 565.841,
"eval_steps_per_second": 35.365,
"step": 12000
},
{
"epoch": 0.12223862936269668,
"grad_norm": 0.10090415924787521,
"learning_rate": 0.000437465,
"loss": 0.0425,
"step": 12500
},
{
"epoch": 0.12712817453720454,
"grad_norm": 0.113510861992836,
"learning_rate": 0.000454965,
"loss": 0.0412,
"step": 13000
},
{
"epoch": 0.12712817453720454,
"eval_accuracy": 0.9965295714285715,
"eval_loss": 0.022982601076364517,
"eval_runtime": 54.4384,
"eval_samples_per_second": 551.082,
"eval_steps_per_second": 34.443,
"step": 13000
},
{
"epoch": 0.13201771971171242,
"grad_norm": 0.09937796741724014,
"learning_rate": 0.00047246500000000004,
"loss": 0.04,
"step": 13500
},
{
"epoch": 0.1369072648862203,
"grad_norm": 0.11914831399917603,
"learning_rate": 0.000489965,
"loss": 0.0389,
"step": 14000
},
{
"epoch": 0.1369072648862203,
"eval_accuracy": 0.996802,
"eval_loss": 0.02113029547035694,
"eval_runtime": 53.2003,
"eval_samples_per_second": 563.907,
"eval_steps_per_second": 35.244,
"step": 14000
},
{
"epoch": 0.14179681006072814,
"grad_norm": 0.17324307560920715,
"learning_rate": 0.000507465,
"loss": 0.0384,
"step": 14500
},
{
"epoch": 0.14668635523523602,
"grad_norm": 0.12623025476932526,
"learning_rate": 0.000524965,
"loss": 0.0364,
"step": 15000
},
{
"epoch": 0.14668635523523602,
"eval_accuracy": 0.9968850476190476,
"eval_loss": 0.020160900428891182,
"eval_runtime": 53.6937,
"eval_samples_per_second": 558.725,
"eval_steps_per_second": 34.92,
"step": 15000
},
{
"epoch": 0.1515759004097439,
"grad_norm": 0.1337081342935562,
"learning_rate": 0.000542465,
"loss": 0.0367,
"step": 15500
},
{
"epoch": 0.15646544558425177,
"grad_norm": 0.16239804029464722,
"learning_rate": 0.000559965,
"loss": 0.0357,
"step": 16000
},
{
"epoch": 0.15646544558425177,
"eval_accuracy": 0.9969695714285715,
"eval_loss": 0.020250126719474792,
"eval_runtime": 54.2376,
"eval_samples_per_second": 553.122,
"eval_steps_per_second": 34.57,
"step": 16000
},
{
"epoch": 0.16135499075875961,
"grad_norm": 0.09299212694168091,
"learning_rate": 0.000577465,
"loss": 0.0356,
"step": 16500
},
{
"epoch": 0.1662445359332675,
"grad_norm": 0.12462040781974792,
"learning_rate": 0.000594965,
"loss": 0.0343,
"step": 17000
},
{
"epoch": 0.1662445359332675,
"eval_accuracy": 0.9971193333333334,
"eval_loss": 0.01877717673778534,
"eval_runtime": 54.3351,
"eval_samples_per_second": 552.13,
"eval_steps_per_second": 34.508,
"step": 17000
},
{
"epoch": 0.17113408110777537,
"grad_norm": 0.08858466893434525,
"learning_rate": 0.000612465,
"loss": 0.0337,
"step": 17500
},
{
"epoch": 0.1760236262822832,
"grad_norm": 0.14879809319972992,
"learning_rate": 0.000629965,
"loss": 0.0335,
"step": 18000
},
{
"epoch": 0.1760236262822832,
"eval_accuracy": 0.9971792380952381,
"eval_loss": 0.018667874857783318,
"eval_runtime": 54.7854,
"eval_samples_per_second": 547.591,
"eval_steps_per_second": 34.224,
"step": 18000
},
{
"epoch": 0.1809131714567911,
"grad_norm": 0.10354409366846085,
"learning_rate": 0.0006474650000000001,
"loss": 0.032,
"step": 18500
},
{
"epoch": 0.18580271663129896,
"grad_norm": 0.1182965636253357,
"learning_rate": 0.000664965,
"loss": 0.0318,
"step": 19000
},
{
"epoch": 0.18580271663129896,
"eval_accuracy": 0.9973930952380953,
"eval_loss": 0.017232514917850494,
"eval_runtime": 53.7973,
"eval_samples_per_second": 557.649,
"eval_steps_per_second": 34.853,
"step": 19000
},
{
"epoch": 0.1906922618058068,
"grad_norm": 0.05959112569689751,
"learning_rate": 0.0006824649999999999,
"loss": 0.0318,
"step": 19500
},
{
"epoch": 0.1955818069803147,
"grad_norm": 0.1270582675933838,
"learning_rate": 0.000699965,
"loss": 0.0307,
"step": 20000
},
{
"epoch": 0.1955818069803147,
"eval_accuracy": 0.9973767619047619,
"eval_loss": 0.01737845316529274,
"eval_runtime": 53.2789,
"eval_samples_per_second": 563.075,
"eval_steps_per_second": 35.192,
"step": 20000
},
{
"epoch": 0.20047135215482256,
"grad_norm": 0.08427739888429642,
"learning_rate": 0.0006980594444444445,
"loss": 0.0298,
"step": 20500
},
{
"epoch": 0.20536089732933044,
"grad_norm": 0.07171203941106796,
"learning_rate": 0.000696115,
"loss": 0.0293,
"step": 21000
},
{
"epoch": 0.20536089732933044,
"eval_accuracy": 0.9975350952380952,
"eval_loss": 0.016114523634314537,
"eval_runtime": 53.7368,
"eval_samples_per_second": 558.277,
"eval_steps_per_second": 34.892,
"step": 21000
},
{
"epoch": 0.2102504425038383,
"grad_norm": 0.07719539105892181,
"learning_rate": 0.0006941705555555555,
"loss": 0.0291,
"step": 21500
},
{
"epoch": 0.21513998767834616,
"grad_norm": 0.08832105249166489,
"learning_rate": 0.0006922261111111111,
"loss": 0.0286,
"step": 22000
},
{
"epoch": 0.21513998767834616,
"eval_accuracy": 0.9976384285714286,
"eval_loss": 0.015542366541922092,
"eval_runtime": 53.6783,
"eval_samples_per_second": 558.885,
"eval_steps_per_second": 34.93,
"step": 22000
},
{
"epoch": 0.22002953285285404,
"grad_norm": 0.1472863107919693,
"learning_rate": 0.0006902816666666667,
"loss": 0.0277,
"step": 22500
},
{
"epoch": 0.22491907802736189,
"grad_norm": 0.09753895550966263,
"learning_rate": 0.0006883372222222222,
"loss": 0.0268,
"step": 23000
},
{
"epoch": 0.22491907802736189,
"eval_accuracy": 0.9977074761904762,
"eval_loss": 0.015192433260381222,
"eval_runtime": 53.436,
"eval_samples_per_second": 561.419,
"eval_steps_per_second": 35.089,
"step": 23000
},
{
"epoch": 0.22980862320186976,
"grad_norm": 0.12348861992359161,
"learning_rate": 0.0006863927777777778,
"loss": 0.026,
"step": 23500
},
{
"epoch": 0.23469816837637764,
"grad_norm": 0.1123756393790245,
"learning_rate": 0.0006844483333333333,
"loss": 0.0257,
"step": 24000
},
{
"epoch": 0.23469816837637764,
"eval_accuracy": 0.997726761904762,
"eval_loss": 0.014932113699615002,
"eval_runtime": 53.1941,
"eval_samples_per_second": 563.972,
"eval_steps_per_second": 35.248,
"step": 24000
},
{
"epoch": 0.23958771355088548,
"grad_norm": 0.07256095856428146,
"learning_rate": 0.0006825038888888889,
"loss": 0.0256,
"step": 24500
},
{
"epoch": 0.24447725872539336,
"grad_norm": 0.05496814846992493,
"learning_rate": 0.0006805594444444444,
"loss": 0.0251,
"step": 25000
},
{
"epoch": 0.24447725872539336,
"eval_accuracy": 0.9978721904761905,
"eval_loss": 0.01384472381323576,
"eval_runtime": 54.0604,
"eval_samples_per_second": 554.935,
"eval_steps_per_second": 34.683,
"step": 25000
},
{
"epoch": 0.24936680389990123,
"grad_norm": 0.09915214031934738,
"learning_rate": 0.000678615,
"loss": 0.0251,
"step": 25500
},
{
"epoch": 0.2542563490744091,
"grad_norm": 0.14060749113559723,
"learning_rate": 0.0006766705555555555,
"loss": 0.0244,
"step": 26000
},
{
"epoch": 0.2542563490744091,
"eval_accuracy": 0.9979192857142857,
"eval_loss": 0.01368007156997919,
"eval_runtime": 52.8524,
"eval_samples_per_second": 567.618,
"eval_steps_per_second": 35.476,
"step": 26000
},
{
"epoch": 0.259145894248917,
"grad_norm": 0.09252548217773438,
"learning_rate": 0.0006747261111111111,
"loss": 0.024,
"step": 26500
},
{
"epoch": 0.26403543942342483,
"grad_norm": 0.11915791034698486,
"learning_rate": 0.0006727816666666666,
"loss": 0.0232,
"step": 27000
},
{
"epoch": 0.26403543942342483,
"eval_accuracy": 0.9980117142857143,
"eval_loss": 0.012998638674616814,
"eval_runtime": 54.0246,
"eval_samples_per_second": 555.303,
"eval_steps_per_second": 34.706,
"step": 27000
},
{
"epoch": 0.2689249845979327,
"grad_norm": 0.10810112953186035,
"learning_rate": 0.0006708372222222222,
"loss": 0.0233,
"step": 27500
},
{
"epoch": 0.2738145297724406,
"grad_norm": 0.07593973726034164,
"learning_rate": 0.0006688927777777778,
"loss": 0.0227,
"step": 28000
},
{
"epoch": 0.2738145297724406,
"eval_accuracy": 0.9980548095238095,
"eval_loss": 0.012805027887225151,
"eval_runtime": 53.176,
"eval_samples_per_second": 564.164,
"eval_steps_per_second": 35.26,
"step": 28000
},
{
"epoch": 0.27870407494694843,
"grad_norm": 0.06336738914251328,
"learning_rate": 0.0006669483333333333,
"loss": 0.0229,
"step": 28500
},
{
"epoch": 0.2835936201214563,
"grad_norm": 0.12944093346595764,
"learning_rate": 0.0006650038888888889,
"loss": 0.0221,
"step": 29000
},
{
"epoch": 0.2835936201214563,
"eval_accuracy": 0.9980741428571429,
"eval_loss": 0.012613357976078987,
"eval_runtime": 53.5915,
"eval_samples_per_second": 559.79,
"eval_steps_per_second": 34.987,
"step": 29000
},
{
"epoch": 0.2884831652959642,
"grad_norm": 0.09919234365224838,
"learning_rate": 0.0006630594444444445,
"loss": 0.0213,
"step": 29500
},
{
"epoch": 0.29337271047047203,
"grad_norm": 0.08204931020736694,
"learning_rate": 0.000661115,
"loss": 0.0219,
"step": 30000
},
{
"epoch": 0.29337271047047203,
"eval_accuracy": 0.998159,
"eval_loss": 0.011940201744437218,
"eval_runtime": 53.1317,
"eval_samples_per_second": 564.635,
"eval_steps_per_second": 35.29,
"step": 30000
},
{
"epoch": 0.2982622556449799,
"grad_norm": 0.11553770303726196,
"learning_rate": 0.0006591705555555556,
"loss": 0.0208,
"step": 30500
},
{
"epoch": 0.3031518008194878,
"grad_norm": 0.12381038069725037,
"learning_rate": 0.0006572261111111111,
"loss": 0.0205,
"step": 31000
},
{
"epoch": 0.3031518008194878,
"eval_accuracy": 0.9982196666666666,
"eval_loss": 0.011603106744587421,
"eval_runtime": 53.375,
"eval_samples_per_second": 562.061,
"eval_steps_per_second": 35.129,
"step": 31000
},
{
"epoch": 0.30804134599399563,
"grad_norm": 0.06441524624824524,
"learning_rate": 0.0006552816666666667,
"loss": 0.0204,
"step": 31500
},
{
"epoch": 0.31293089116850353,
"grad_norm": 0.08449769020080566,
"learning_rate": 0.0006533372222222222,
"loss": 0.0206,
"step": 32000
},
{
"epoch": 0.31293089116850353,
"eval_accuracy": 0.9982467142857143,
"eval_loss": 0.011421745643019676,
"eval_runtime": 53.2003,
"eval_samples_per_second": 563.907,
"eval_steps_per_second": 35.244,
"step": 32000
},
{
"epoch": 0.3178204363430114,
"grad_norm": 0.07885874062776566,
"learning_rate": 0.0006513927777777777,
"loss": 0.02,
"step": 32500
},
{
"epoch": 0.32270998151751923,
"grad_norm": 0.07178321480751038,
"learning_rate": 0.0006494483333333333,
"loss": 0.0193,
"step": 33000
},
{
"epoch": 0.32270998151751923,
"eval_accuracy": 0.9983428571428571,
"eval_loss": 0.011021795682609081,
"eval_runtime": 53.8106,
"eval_samples_per_second": 557.511,
"eval_steps_per_second": 34.844,
"step": 33000
},
{
"epoch": 0.32759952669202713,
"grad_norm": 0.06164510175585747,
"learning_rate": 0.0006475038888888888,
"loss": 0.0192,
"step": 33500
},
{
"epoch": 0.332489071866535,
"grad_norm": 0.11073775589466095,
"learning_rate": 0.0006455594444444444,
"loss": 0.0193,
"step": 34000
},
{
"epoch": 0.332489071866535,
"eval_accuracy": 0.9983445238095238,
"eval_loss": 0.010947330854833126,
"eval_runtime": 53.4068,
"eval_samples_per_second": 561.727,
"eval_steps_per_second": 35.108,
"step": 34000
},
{
"epoch": 0.3373786170410428,
"grad_norm": 0.1216714084148407,
"learning_rate": 0.0006436149999999999,
"loss": 0.0191,
"step": 34500
},
{
"epoch": 0.34226816221555073,
"grad_norm": 0.07570644468069077,
"learning_rate": 0.0006416705555555556,
"loss": 0.0189,
"step": 35000
},
{
"epoch": 0.34226816221555073,
"eval_accuracy": 0.9984051904761905,
"eval_loss": 0.01051774900406599,
"eval_runtime": 53.8775,
"eval_samples_per_second": 556.819,
"eval_steps_per_second": 34.801,
"step": 35000
},
{
"epoch": 0.3471577073900586,
"grad_norm": 0.10820703208446503,
"learning_rate": 0.0006397261111111112,
"loss": 0.0187,
"step": 35500
},
{
"epoch": 0.3520472525645664,
"grad_norm": 0.13289569318294525,
"learning_rate": 0.0006377816666666667,
"loss": 0.0181,
"step": 36000
},
{
"epoch": 0.3520472525645664,
"eval_accuracy": 0.9984183333333333,
"eval_loss": 0.010617985390126705,
"eval_runtime": 53.6453,
"eval_samples_per_second": 559.229,
"eval_steps_per_second": 34.952,
"step": 36000
},
{
"epoch": 0.35693679773907433,
"grad_norm": 0.09950833022594452,
"learning_rate": 0.0006358372222222223,
"loss": 0.0178,
"step": 36500
},
{
"epoch": 0.3618263429135822,
"grad_norm": 0.12055996805429459,
"learning_rate": 0.0006338927777777778,
"loss": 0.0174,
"step": 37000
},
{
"epoch": 0.3618263429135822,
"eval_accuracy": 0.9984319047619048,
"eval_loss": 0.01043427549302578,
"eval_runtime": 53.8445,
"eval_samples_per_second": 557.16,
"eval_steps_per_second": 34.823,
"step": 37000
},
{
"epoch": 0.36671588808809,
"grad_norm": 0.08831817656755447,
"learning_rate": 0.0006319483333333334,
"loss": 0.0183,
"step": 37500
},
{
"epoch": 0.37160543326259793,
"grad_norm": 0.09790224581956863,
"learning_rate": 0.0006300038888888889,
"loss": 0.0171,
"step": 38000
},
{
"epoch": 0.37160543326259793,
"eval_accuracy": 0.9984588571428571,
"eval_loss": 0.010330071672797203,
"eval_runtime": 53.8512,
"eval_samples_per_second": 557.091,
"eval_steps_per_second": 34.818,
"step": 38000
},
{
"epoch": 0.3764949784371058,
"grad_norm": 0.05283864215016365,
"learning_rate": 0.0006280594444444444,
"loss": 0.017,
"step": 38500
},
{
"epoch": 0.3813845236116136,
"grad_norm": 0.12874823808670044,
"learning_rate": 0.000626115,
"loss": 0.0173,
"step": 39000
},
{
"epoch": 0.3813845236116136,
"eval_accuracy": 0.9984891904761904,
"eval_loss": 0.009993654675781727,
"eval_runtime": 53.3842,
"eval_samples_per_second": 561.964,
"eval_steps_per_second": 35.123,
"step": 39000
},
{
"epoch": 0.3862740687861215,
"grad_norm": 0.08774898201227188,
"learning_rate": 0.0006241705555555555,
"loss": 0.0173,
"step": 39500
},
{
"epoch": 0.3911636139606294,
"grad_norm": 0.092228963971138,
"learning_rate": 0.0006222261111111111,
"loss": 0.0169,
"step": 40000
},
{
"epoch": 0.3911636139606294,
"eval_accuracy": 0.9984447142857142,
"eval_loss": 0.010405597276985645,
"eval_runtime": 53.1659,
"eval_samples_per_second": 564.272,
"eval_steps_per_second": 35.267,
"step": 40000
},
{
"epoch": 0.3960531591351372,
"grad_norm": 0.08975362032651901,
"learning_rate": 0.0006202816666666666,
"loss": 0.0174,
"step": 40500
},
{
"epoch": 0.4009427043096451,
"grad_norm": 0.09612125158309937,
"learning_rate": 0.0006183372222222222,
"loss": 0.0168,
"step": 41000
},
{
"epoch": 0.4009427043096451,
"eval_accuracy": 0.9985740952380953,
"eval_loss": 0.009390046820044518,
"eval_runtime": 53.7483,
"eval_samples_per_second": 558.158,
"eval_steps_per_second": 34.885,
"step": 41000
},
{
"epoch": 0.405832249484153,
"grad_norm": 0.04056503251194954,
"learning_rate": 0.0006163927777777777,
"loss": 0.0163,
"step": 41500
},
{
"epoch": 0.4107217946586609,
"grad_norm": 0.11665570735931396,
"learning_rate": 0.0006144483333333333,
"loss": 0.0165,
"step": 42000
},
{
"epoch": 0.4107217946586609,
"eval_accuracy": 0.998547619047619,
"eval_loss": 0.009648078121244907,
"eval_runtime": 53.5013,
"eval_samples_per_second": 560.734,
"eval_steps_per_second": 35.046,
"step": 42000
},
{
"epoch": 0.4156113398331687,
"grad_norm": 0.10102874785661697,
"learning_rate": 0.000612503888888889,
"loss": 0.0163,
"step": 42500
},
{
"epoch": 0.4205008850076766,
"grad_norm": 0.08108735084533691,
"learning_rate": 0.0006105594444444445,
"loss": 0.0154,
"step": 43000
},
{
"epoch": 0.4205008850076766,
"eval_accuracy": 0.998580380952381,
"eval_loss": 0.009399999864399433,
"eval_runtime": 53.6417,
"eval_samples_per_second": 559.266,
"eval_steps_per_second": 34.954,
"step": 43000
},
{
"epoch": 0.4253904301821845,
"grad_norm": 0.07910118252038956,
"learning_rate": 0.000608615,
"loss": 0.0158,
"step": 43500
},
{
"epoch": 0.4302799753566923,
"grad_norm": 0.0742466077208519,
"learning_rate": 0.0006066705555555556,
"loss": 0.0154,
"step": 44000
},
{
"epoch": 0.4302799753566923,
"eval_accuracy": 0.9986305238095238,
"eval_loss": 0.009053844027221203,
"eval_runtime": 53.2625,
"eval_samples_per_second": 563.248,
"eval_steps_per_second": 35.203,
"step": 44000
},
{
"epoch": 0.43516952053120017,
"grad_norm": 0.06712730973958969,
"learning_rate": 0.0006047261111111111,
"loss": 0.0157,
"step": 44500
},
{
"epoch": 0.4400590657057081,
"grad_norm": 0.049518078565597534,
"learning_rate": 0.0006027816666666667,
"loss": 0.0154,
"step": 45000
},
{
"epoch": 0.4400590657057081,
"eval_accuracy": 0.9986142857142857,
"eval_loss": 0.009205291979014874,
"eval_runtime": 55.4539,
"eval_samples_per_second": 540.99,
"eval_steps_per_second": 33.812,
"step": 45000
},
{
"epoch": 0.4449486108802159,
"grad_norm": 0.0538068488240242,
"learning_rate": 0.0006008372222222222,
"loss": 0.0154,
"step": 45500
},
{
"epoch": 0.44983815605472377,
"grad_norm": 0.08187378942966461,
"learning_rate": 0.0005988927777777778,
"loss": 0.015,
"step": 46000
},
{
"epoch": 0.44983815605472377,
"eval_accuracy": 0.9986327142857143,
"eval_loss": 0.009027380496263504,
"eval_runtime": 53.2362,
"eval_samples_per_second": 563.526,
"eval_steps_per_second": 35.22,
"step": 46000
},
{
"epoch": 0.4547277012292317,
"grad_norm": 0.04306895285844803,
"learning_rate": 0.0005969483333333333,
"loss": 0.0153,
"step": 46500
},
{
"epoch": 0.4596172464037395,
"grad_norm": 0.053645290434360504,
"learning_rate": 0.0005950038888888889,
"loss": 0.0146,
"step": 47000
},
{
"epoch": 0.4596172464037395,
"eval_accuracy": 0.998660619047619,
"eval_loss": 0.008829508908092976,
"eval_runtime": 54.0772,
"eval_samples_per_second": 554.763,
"eval_steps_per_second": 34.673,
"step": 47000
},
{
"epoch": 0.46450679157824737,
"grad_norm": 0.08367203176021576,
"learning_rate": 0.0005930594444444444,
"loss": 0.0149,
"step": 47500
},
{
"epoch": 0.46939633675275527,
"grad_norm": 0.06427811086177826,
"learning_rate": 0.000591115,
"loss": 0.0146,
"step": 48000
},
{
"epoch": 0.46939633675275527,
"eval_accuracy": 0.9986682857142857,
"eval_loss": 0.008711729198694229,
"eval_runtime": 54.7568,
"eval_samples_per_second": 547.877,
"eval_steps_per_second": 34.242,
"step": 48000
},
{
"epoch": 0.4742858819272631,
"grad_norm": 0.09367698431015015,
"learning_rate": 0.0005891705555555556,
"loss": 0.0146,
"step": 48500
},
{
"epoch": 0.47917542710177097,
"grad_norm": 0.023252153769135475,
"learning_rate": 0.0005872261111111111,
"loss": 0.0143,
"step": 49000
},
{
"epoch": 0.47917542710177097,
"eval_accuracy": 0.9987029047619047,
"eval_loss": 0.00848183874040842,
"eval_runtime": 53.9633,
"eval_samples_per_second": 555.933,
"eval_steps_per_second": 34.746,
"step": 49000
},
{
"epoch": 0.48406497227627887,
"grad_norm": 0.038976676762104034,
"learning_rate": 0.0005852816666666666,
"loss": 0.0142,
"step": 49500
},
{
"epoch": 0.4889545174507867,
"grad_norm": 0.048157546669244766,
"learning_rate": 0.0005833372222222221,
"loss": 0.0146,
"step": 50000
},
{
"epoch": 0.4889545174507867,
"eval_accuracy": 0.9986898571428572,
"eval_loss": 0.008633621968328953,
"eval_runtime": 53.3435,
"eval_samples_per_second": 562.392,
"eval_steps_per_second": 35.15,
"step": 50000
},
{
"epoch": 0.4938440626252946,
"grad_norm": 0.04257979243993759,
"learning_rate": 0.0005813927777777777,
"loss": 0.0145,
"step": 50500
},
{
"epoch": 0.49873360779980247,
"grad_norm": 0.09921249002218246,
"learning_rate": 0.0005794483333333334,
"loss": 0.0142,
"step": 51000
},
{
"epoch": 0.49873360779980247,
"eval_accuracy": 0.9987676666666667,
"eval_loss": 0.008316335268318653,
"eval_runtime": 53.6985,
"eval_samples_per_second": 558.675,
"eval_steps_per_second": 34.917,
"step": 51000
},
{
"epoch": 0.5036231529743104,
"grad_norm": 0.048569273203611374,
"learning_rate": 0.0005775038888888889,
"loss": 0.0135,
"step": 51500
},
{
"epoch": 0.5085126981488182,
"grad_norm": 0.06064219772815704,
"learning_rate": 0.0005755594444444445,
"loss": 0.0139,
"step": 52000
},
{
"epoch": 0.5085126981488182,
"eval_accuracy": 0.9987182380952381,
"eval_loss": 0.008500739932060242,
"eval_runtime": 53.1478,
"eval_samples_per_second": 564.463,
"eval_steps_per_second": 35.279,
"step": 52000
},
{
"epoch": 0.5134022433233261,
"grad_norm": 0.043598126620054245,
"learning_rate": 0.000573615,
"loss": 0.0145,
"step": 52500
},
{
"epoch": 0.518291788497834,
"grad_norm": 0.059862203896045685,
"learning_rate": 0.0005716705555555556,
"loss": 0.0134,
"step": 53000
},
{
"epoch": 0.518291788497834,
"eval_accuracy": 0.9987784761904762,
"eval_loss": 0.008033830672502518,
"eval_runtime": 55.7465,
"eval_samples_per_second": 538.15,
"eval_steps_per_second": 33.634,
"step": 53000
},
{
"epoch": 0.5231813336723418,
"grad_norm": 0.05372610315680504,
"learning_rate": 0.0005697261111111111,
"loss": 0.0136,
"step": 53500
},
{
"epoch": 0.5280708788468497,
"grad_norm": 0.08553345501422882,
"learning_rate": 0.0005677816666666667,
"loss": 0.0138,
"step": 54000
},
{
"epoch": 0.5280708788468497,
"eval_accuracy": 0.9988229047619047,
"eval_loss": 0.007664266973733902,
"eval_runtime": 53.9758,
"eval_samples_per_second": 555.805,
"eval_steps_per_second": 34.738,
"step": 54000
},
{
"epoch": 0.5329604240213576,
"grad_norm": 0.03992351144552231,
"learning_rate": 0.0005658372222222222,
"loss": 0.0133,
"step": 54500
},
{
"epoch": 0.5378499691958654,
"grad_norm": 0.051119010895490646,
"learning_rate": 0.0005638927777777777,
"loss": 0.0135,
"step": 55000
},
{
"epoch": 0.5378499691958654,
"eval_accuracy": 0.9988099523809524,
"eval_loss": 0.007848628796637058,
"eval_runtime": 54.3513,
"eval_samples_per_second": 551.965,
"eval_steps_per_second": 34.498,
"step": 55000
},
{
"epoch": 0.5427395143703733,
"grad_norm": 0.08714370429515839,
"learning_rate": 0.0005619483333333333,
"loss": 0.0128,
"step": 55500
},
{
"epoch": 0.5476290595448812,
"grad_norm": 0.07373756170272827,
"learning_rate": 0.0005600038888888888,
"loss": 0.013,
"step": 56000
},
{
"epoch": 0.5476290595448812,
"eval_accuracy": 0.9988279047619048,
"eval_loss": 0.007725988980382681,
"eval_runtime": 53.269,
"eval_samples_per_second": 563.179,
"eval_steps_per_second": 35.199,
"step": 56000
},
{
"epoch": 0.552518604719389,
"grad_norm": 0.04964112490415573,
"learning_rate": 0.0005580594444444444,
"loss": 0.0132,
"step": 56500
},
{
"epoch": 0.5574081498938969,
"grad_norm": 0.08856749534606934,
"learning_rate": 0.000556115,
"loss": 0.0128,
"step": 57000
},
{
"epoch": 0.5574081498938969,
"eval_accuracy": 0.998819,
"eval_loss": 0.007981804199516773,
"eval_runtime": 54.4577,
"eval_samples_per_second": 550.886,
"eval_steps_per_second": 34.43,
"step": 57000
},
{
"epoch": 0.5622976950684048,
"grad_norm": 0.06801512092351913,
"learning_rate": 0.0005541705555555555,
"loss": 0.0129,
"step": 57500
},
{
"epoch": 0.5671872402429126,
"grad_norm": 0.21337199211120605,
"learning_rate": 0.0005522261111111112,
"loss": 0.0131,
"step": 58000
},
{
"epoch": 0.5671872402429126,
"eval_accuracy": 0.9988361904761904,
"eval_loss": 0.0077649368904531,
"eval_runtime": 53.1495,
"eval_samples_per_second": 564.445,
"eval_steps_per_second": 35.278,
"step": 58000
},
{
"epoch": 0.5720767854174205,
"grad_norm": 0.0754612609744072,
"learning_rate": 0.0005502816666666667,
"loss": 0.013,
"step": 58500
},
{
"epoch": 0.5769663305919284,
"grad_norm": 0.053277261555194855,
"learning_rate": 0.0005483372222222223,
"loss": 0.013,
"step": 59000
},
{
"epoch": 0.5769663305919284,
"eval_accuracy": 0.9988713333333333,
"eval_loss": 0.007438257802277803,
"eval_runtime": 53.9725,
"eval_samples_per_second": 555.839,
"eval_steps_per_second": 34.74,
"step": 59000
},
{
"epoch": 0.5818558757664362,
"grad_norm": 0.057580217719078064,
"learning_rate": 0.0005463927777777778,
"loss": 0.0126,
"step": 59500
},
{
"epoch": 0.5867454209409441,
"grad_norm": 0.08538717031478882,
"learning_rate": 0.0005444483333333334,
"loss": 0.0125,
"step": 60000
},
{
"epoch": 0.5867454209409441,
"eval_accuracy": 0.9988772857142857,
"eval_loss": 0.0073426892049610615,
"eval_runtime": 53.9301,
"eval_samples_per_second": 556.276,
"eval_steps_per_second": 34.767,
"step": 60000
},
{
"epoch": 0.591634966115452,
"grad_norm": 0.07628747820854187,
"learning_rate": 0.0005425038888888889,
"loss": 0.0127,
"step": 60500
},
{
"epoch": 0.5965245112899598,
"grad_norm": 0.059503812342882156,
"learning_rate": 0.0005405594444444444,
"loss": 0.0119,
"step": 61000
},
{
"epoch": 0.5965245112899598,
"eval_accuracy": 0.9988955714285714,
"eval_loss": 0.007260579615831375,
"eval_runtime": 54.1591,
"eval_samples_per_second": 553.924,
"eval_steps_per_second": 34.62,
"step": 61000
},
{
"epoch": 0.6014140564644677,
"grad_norm": 0.07128513604402542,
"learning_rate": 0.000538615,
"loss": 0.012,
"step": 61500
},
{
"epoch": 0.6063036016389756,
"grad_norm": 0.0615658275783062,
"learning_rate": 0.0005366705555555555,
"loss": 0.0121,
"step": 62000
},
{
"epoch": 0.6063036016389756,
"eval_accuracy": 0.9988924285714286,
"eval_loss": 0.007354605942964554,
"eval_runtime": 53.6133,
"eval_samples_per_second": 559.563,
"eval_steps_per_second": 34.973,
"step": 62000
},
{
"epoch": 0.6111931468134834,
"grad_norm": 0.04977503791451454,
"learning_rate": 0.0005347261111111111,
"loss": 0.0125,
"step": 62500
},
{
"epoch": 0.6160826919879913,
"grad_norm": 0.06748691946268082,
"learning_rate": 0.0005327816666666666,
"loss": 0.0123,
"step": 63000
},
{
"epoch": 0.6160826919879913,
"eval_accuracy": 0.9989074761904762,
"eval_loss": 0.007279036566615105,
"eval_runtime": 54.221,
"eval_samples_per_second": 553.291,
"eval_steps_per_second": 34.581,
"step": 63000
},
{
"epoch": 0.6209722371624992,
"grad_norm": 0.08432789891958237,
"learning_rate": 0.0005308372222222222,
"loss": 0.0119,
"step": 63500
},
{
"epoch": 0.6258617823370071,
"grad_norm": 0.08450587093830109,
"learning_rate": 0.0005288927777777778,
"loss": 0.0123,
"step": 64000
},
{
"epoch": 0.6258617823370071,
"eval_accuracy": 0.998906619047619,
"eval_loss": 0.007195043843239546,
"eval_runtime": 53.6077,
"eval_samples_per_second": 559.621,
"eval_steps_per_second": 34.976,
"step": 64000
},
{
"epoch": 0.6307513275115149,
"grad_norm": 0.05454770103096962,
"learning_rate": 0.0005269483333333333,
"loss": 0.0119,
"step": 64500
},
{
"epoch": 0.6356408726860228,
"grad_norm": 0.029517434537410736,
"learning_rate": 0.0005250038888888889,
"loss": 0.0115,
"step": 65000
},
{
"epoch": 0.6356408726860228,
"eval_accuracy": 0.99894,
"eval_loss": 0.006976461503654718,
"eval_runtime": 54.3436,
"eval_samples_per_second": 552.043,
"eval_steps_per_second": 34.503,
"step": 65000
},
{
"epoch": 0.6405304178605307,
"grad_norm": 0.08749569207429886,
"learning_rate": 0.0005230594444444444,
"loss": 0.0117,
"step": 65500
},
{
"epoch": 0.6454199630350385,
"grad_norm": 0.08669404685497284,
"learning_rate": 0.000521115,
"loss": 0.0118,
"step": 66000
},
{
"epoch": 0.6454199630350385,
"eval_accuracy": 0.9989269523809524,
"eval_loss": 0.0070405821315944195,
"eval_runtime": 53.0176,
"eval_samples_per_second": 565.85,
"eval_steps_per_second": 35.366,
"step": 66000
},
{
"epoch": 0.6503095082095464,
"grad_norm": 0.08068472146987915,
"learning_rate": 0.0005191705555555556,
"loss": 0.012,
"step": 66500
},
{
"epoch": 0.6551990533840543,
"grad_norm": 0.06560824811458588,
"learning_rate": 0.0005172261111111111,
"loss": 0.0113,
"step": 67000
},
{
"epoch": 0.6551990533840543,
"eval_accuracy": 0.9989625238095238,
"eval_loss": 0.006877726875245571,
"eval_runtime": 57.7066,
"eval_samples_per_second": 519.871,
"eval_steps_per_second": 32.492,
"step": 67000
},
{
"epoch": 0.6600885985585621,
"grad_norm": 0.10351342707872391,
"learning_rate": 0.0005152816666666667,
"loss": 0.0117,
"step": 67500
},
{
"epoch": 0.66497814373307,
"grad_norm": 0.06295846402645111,
"learning_rate": 0.0005133372222222222,
"loss": 0.0113,
"step": 68000
},
{
"epoch": 0.66497814373307,
"eval_accuracy": 0.9989663333333333,
"eval_loss": 0.006734638474881649,
"eval_runtime": 56.5776,
"eval_samples_per_second": 530.245,
"eval_steps_per_second": 33.14,
"step": 68000
},
{
"epoch": 0.6698676889075779,
"grad_norm": 0.07197780162096024,
"learning_rate": 0.0005113927777777778,
"loss": 0.0112,
"step": 68500
},
{
"epoch": 0.6747572340820857,
"grad_norm": 0.05394699051976204,
"learning_rate": 0.0005094483333333333,
"loss": 0.0111,
"step": 69000
},
{
"epoch": 0.6747572340820857,
"eval_accuracy": 0.9989654761904762,
"eval_loss": 0.006897720508277416,
"eval_runtime": 53.9516,
"eval_samples_per_second": 556.054,
"eval_steps_per_second": 34.753,
"step": 69000
},
{
"epoch": 0.6796467792565936,
"grad_norm": 0.08804675191640854,
"learning_rate": 0.0005075038888888889,
"loss": 0.0114,
"step": 69500
},
{
"epoch": 0.6845363244311015,
"grad_norm": 0.061258211731910706,
"learning_rate": 0.0005055594444444445,
"loss": 0.0116,
"step": 70000
},
{
"epoch": 0.6845363244311015,
"eval_accuracy": 0.998991619047619,
"eval_loss": 0.006613132543861866,
"eval_runtime": 53.4248,
"eval_samples_per_second": 561.537,
"eval_steps_per_second": 35.096,
"step": 70000
},
{
"epoch": 0.6894258696056093,
"grad_norm": 0.047413647174835205,
"learning_rate": 0.000503615,
"loss": 0.0114,
"step": 70500
},
{
"epoch": 0.6943154147801172,
"grad_norm": 0.048444923013448715,
"learning_rate": 0.0005016705555555556,
"loss": 0.0111,
"step": 71000
},
{
"epoch": 0.6943154147801172,
"eval_accuracy": 0.9989892857142857,
"eval_loss": 0.006757956929504871,
"eval_runtime": 54.0915,
"eval_samples_per_second": 554.616,
"eval_steps_per_second": 34.663,
"step": 71000
},
{
"epoch": 0.6992049599546251,
"grad_norm": 0.0633966252207756,
"learning_rate": 0.0004997261111111111,
"loss": 0.011,
"step": 71500
},
{
"epoch": 0.7040945051291329,
"grad_norm": 0.05330997332930565,
"learning_rate": 0.0004977816666666666,
"loss": 0.0111,
"step": 72000
},
{
"epoch": 0.7040945051291329,
"eval_accuracy": 0.9989945238095238,
"eval_loss": 0.006628294009715319,
"eval_runtime": 53.5745,
"eval_samples_per_second": 559.968,
"eval_steps_per_second": 34.998,
"step": 72000
},
{
"epoch": 0.7089840503036408,
"grad_norm": 0.08384311944246292,
"learning_rate": 0.0004958372222222222,
"loss": 0.0112,
"step": 72500
},
{
"epoch": 0.7138735954781487,
"grad_norm": 0.012912419624626637,
"learning_rate": 0.0004938927777777777,
"loss": 0.0108,
"step": 73000
},
{
"epoch": 0.7138735954781487,
"eval_accuracy": 0.9990231904761905,
"eval_loss": 0.0064848195761442184,
"eval_runtime": 53.4714,
"eval_samples_per_second": 561.048,
"eval_steps_per_second": 35.065,
"step": 73000
},
{
"epoch": 0.7187631406526565,
"grad_norm": 0.03586062043905258,
"learning_rate": 0.0004919483333333333,
"loss": 0.0106,
"step": 73500
},
{
"epoch": 0.7236526858271644,
"grad_norm": 0.03920240327715874,
"learning_rate": 0.0004900038888888888,
"loss": 0.0108,
"step": 74000
},
{
"epoch": 0.7236526858271644,
"eval_accuracy": 0.9990156666666666,
"eval_loss": 0.00646663922816515,
"eval_runtime": 53.3973,
"eval_samples_per_second": 561.827,
"eval_steps_per_second": 35.114,
"step": 74000
},
{
"epoch": 0.7285422310016723,
"grad_norm": 0.07299363613128662,
"learning_rate": 0.00048805944444444446,
"loss": 0.0109,
"step": 74500
},
{
"epoch": 0.73343177617618,
"grad_norm": 0.061152711510658264,
"learning_rate": 0.000486115,
"loss": 0.0102,
"step": 75000
},
{
"epoch": 0.73343177617618,
"eval_accuracy": 0.9990235714285715,
"eval_loss": 0.00648918654769659,
"eval_runtime": 54.8259,
"eval_samples_per_second": 547.187,
"eval_steps_per_second": 34.199,
"step": 75000
},
{
"epoch": 0.738321321350688,
"grad_norm": 0.052978385239839554,
"learning_rate": 0.0004841705555555556,
"loss": 0.0108,
"step": 75500
},
{
"epoch": 0.7432108665251959,
"grad_norm": 0.03460371494293213,
"learning_rate": 0.00048222611111111113,
"loss": 0.0104,
"step": 76000
},
{
"epoch": 0.7432108665251959,
"eval_accuracy": 0.9990159047619047,
"eval_loss": 0.006446553394198418,
"eval_runtime": 53.4946,
"eval_samples_per_second": 560.804,
"eval_steps_per_second": 35.05,
"step": 76000
},
{
"epoch": 0.7481004116997036,
"grad_norm": 0.08936499804258347,
"learning_rate": 0.0004802816666666667,
"loss": 0.0105,
"step": 76500
},
{
"epoch": 0.7529899568742116,
"grad_norm": 0.04613318666815758,
"learning_rate": 0.00047833722222222224,
"loss": 0.0104,
"step": 77000
},
{
"epoch": 0.7529899568742116,
"eval_accuracy": 0.9990385714285714,
"eval_loss": 0.0063977050594985485,
"eval_runtime": 54.4466,
"eval_samples_per_second": 550.998,
"eval_steps_per_second": 34.437,
"step": 77000
},
{
"epoch": 0.7578795020487195,
"grad_norm": 0.05318485200405121,
"learning_rate": 0.00047639277777777775,
"loss": 0.0106,
"step": 77500
},
{
"epoch": 0.7627690472232272,
"grad_norm": 0.061067450791597366,
"learning_rate": 0.0004744483333333333,
"loss": 0.0101,
"step": 78000
},
{
"epoch": 0.7627690472232272,
"eval_accuracy": 0.9990490952380953,
"eval_loss": 0.006357032340019941,
"eval_runtime": 53.2067,
"eval_samples_per_second": 563.839,
"eval_steps_per_second": 35.24,
"step": 78000
},
{
"epoch": 0.7676585923977352,
"grad_norm": 0.042733557522296906,
"learning_rate": 0.00047250388888888886,
"loss": 0.01,
"step": 78500
},
{
"epoch": 0.772548137572243,
"grad_norm": 0.08034121245145798,
"learning_rate": 0.0004705594444444444,
"loss": 0.0103,
"step": 79000
},
{
"epoch": 0.772548137572243,
"eval_accuracy": 0.9990574761904762,
"eval_loss": 0.0062187593430280685,
"eval_runtime": 53.8428,
"eval_samples_per_second": 557.177,
"eval_steps_per_second": 34.824,
"step": 79000
},
{
"epoch": 0.7774376827467508,
"grad_norm": 0.07830695807933807,
"learning_rate": 0.00046861499999999997,
"loss": 0.0101,
"step": 79500
},
{
"epoch": 0.7823272279212587,
"grad_norm": 0.07382604479789734,
"learning_rate": 0.00046667055555555553,
"loss": 0.0103,
"step": 80000
},
{
"epoch": 0.7823272279212587,
"eval_accuracy": 0.9990759047619048,
"eval_loss": 0.006141056306660175,
"eval_runtime": 53.2408,
"eval_samples_per_second": 563.478,
"eval_steps_per_second": 35.217,
"step": 80000
},
{
"epoch": 0.7872167730957667,
"grad_norm": 0.1125330850481987,
"learning_rate": 0.00046472611111111114,
"loss": 0.0102,
"step": 80500
},
{
"epoch": 0.7921063182702744,
"grad_norm": 0.03520214557647705,
"learning_rate": 0.0004627816666666667,
"loss": 0.01,
"step": 81000
},
{
"epoch": 0.7921063182702744,
"eval_accuracy": 0.9990739047619047,
"eval_loss": 0.006076267920434475,
"eval_runtime": 53.791,
"eval_samples_per_second": 557.715,
"eval_steps_per_second": 34.857,
"step": 81000
},
{
"epoch": 0.7969958634447823,
"grad_norm": 0.042487915605306625,
"learning_rate": 0.00046083722222222225,
"loss": 0.0097,
"step": 81500
},
{
"epoch": 0.8018854086192903,
"grad_norm": 0.054117601364851,
"learning_rate": 0.0004588927777777778,
"loss": 0.0101,
"step": 82000
},
{
"epoch": 0.8018854086192903,
"eval_accuracy": 0.9990634761904762,
"eval_loss": 0.006164718419313431,
"eval_runtime": 53.5332,
"eval_samples_per_second": 560.4,
"eval_steps_per_second": 35.025,
"step": 82000
},
{
"epoch": 0.8067749537937982,
"grad_norm": 0.04976029694080353,
"learning_rate": 0.00045694833333333336,
"loss": 0.0099,
"step": 82500
},
{
"epoch": 0.811664498968306,
"grad_norm": 0.054267916828393936,
"learning_rate": 0.00045500388888888887,
"loss": 0.0097,
"step": 83000
},
{
"epoch": 0.811664498968306,
"eval_accuracy": 0.9990979047619047,
"eval_loss": 0.005987876560539007,
"eval_runtime": 54.2289,
"eval_samples_per_second": 553.211,
"eval_steps_per_second": 34.576,
"step": 83000
},
{
"epoch": 0.8165540441428139,
"grad_norm": 0.03349093720316887,
"learning_rate": 0.0004530594444444444,
"loss": 0.0094,
"step": 83500
},
{
"epoch": 0.8214435893173218,
"grad_norm": 0.04999032989144325,
"learning_rate": 0.000451115,
"loss": 0.0101,
"step": 84000
},
{
"epoch": 0.8214435893173218,
"eval_accuracy": 0.9990905714285714,
"eval_loss": 0.006010835990309715,
"eval_runtime": 53.4299,
"eval_samples_per_second": 561.483,
"eval_steps_per_second": 35.093,
"step": 84000
},
{
"epoch": 0.8263331344918295,
"grad_norm": 0.045149870216846466,
"learning_rate": 0.00044917055555555554,
"loss": 0.0097,
"step": 84500
},
{
"epoch": 0.8312226796663374,
"grad_norm": 0.0918109267950058,
"learning_rate": 0.0004472261111111111,
"loss": 0.0099,
"step": 85000
},
{
"epoch": 0.8312226796663374,
"eval_accuracy": 0.9990772380952381,
"eval_loss": 0.006181794218719006,
"eval_runtime": 54.1897,
"eval_samples_per_second": 553.611,
"eval_steps_per_second": 34.601,
"step": 85000
},
{
"epoch": 0.8361122248408454,
"grad_norm": 0.0643276646733284,
"learning_rate": 0.00044528166666666665,
"loss": 0.0099,
"step": 85500
},
{
"epoch": 0.8410017700153531,
"grad_norm": 0.06930361688137054,
"learning_rate": 0.0004433372222222222,
"loss": 0.0095,
"step": 86000
},
{
"epoch": 0.8410017700153531,
"eval_accuracy": 0.9991025238095238,
"eval_loss": 0.00590873695909977,
"eval_runtime": 52.8942,
"eval_samples_per_second": 567.169,
"eval_steps_per_second": 35.448,
"step": 86000
},
{
"epoch": 0.845891315189861,
"grad_norm": 0.08163397759199142,
"learning_rate": 0.00044139277777777776,
"loss": 0.0099,
"step": 86500
},
{
"epoch": 0.850780860364369,
"grad_norm": 0.0483279749751091,
"learning_rate": 0.00043944833333333337,
"loss": 0.0092,
"step": 87000
},
{
"epoch": 0.850780860364369,
"eval_accuracy": 0.9991092857142857,
"eval_loss": 0.006001894827932119,
"eval_runtime": 53.2268,
"eval_samples_per_second": 563.626,
"eval_steps_per_second": 35.227,
"step": 87000
},
{
"epoch": 0.8556704055388767,
"grad_norm": 0.02636638656258583,
"learning_rate": 0.00043750388888888893,
"loss": 0.0094,
"step": 87500
},
{
"epoch": 0.8605599507133846,
"grad_norm": 0.042217135429382324,
"learning_rate": 0.0004355594444444445,
"loss": 0.0092,
"step": 88000
},
{
"epoch": 0.8605599507133846,
"eval_accuracy": 0.999128,
"eval_loss": 0.005815317388623953,
"eval_runtime": 53.8299,
"eval_samples_per_second": 557.311,
"eval_steps_per_second": 34.832,
"step": 88000
},
{
"epoch": 0.8654494958878926,
"grad_norm": 0.08632192760705948,
"learning_rate": 0.00043361499999999993,
"loss": 0.0092,
"step": 88500
},
{
"epoch": 0.8703390410624003,
"grad_norm": 0.04315312206745148,
"learning_rate": 0.00043167055555555554,
"loss": 0.0094,
"step": 89000
},
{
"epoch": 0.8703390410624003,
"eval_accuracy": 0.9991279047619047,
"eval_loss": 0.0056898752227425575,
"eval_runtime": 53.7075,
"eval_samples_per_second": 558.581,
"eval_steps_per_second": 34.911,
"step": 89000
},
{
"epoch": 0.8752285862369082,
"grad_norm": 0.03837065026164055,
"learning_rate": 0.0004297261111111111,
"loss": 0.0094,
"step": 89500
},
{
"epoch": 0.8801181314114161,
"grad_norm": 0.04201444238424301,
"learning_rate": 0.00042778166666666666,
"loss": 0.0093,
"step": 90000
},
{
"epoch": 0.8801181314114161,
"eval_accuracy": 0.9991310952380953,
"eval_loss": 0.00587738212198019,
"eval_runtime": 53.5135,
"eval_samples_per_second": 560.606,
"eval_steps_per_second": 35.038,
"step": 90000
},
{
"epoch": 0.8850076765859239,
"grad_norm": 0.061635617166757584,
"learning_rate": 0.0004258372222222222,
"loss": 0.0092,
"step": 90500
},
{
"epoch": 0.8898972217604318,
"grad_norm": 0.03518196567893028,
"learning_rate": 0.00042389277777777777,
"loss": 0.0088,
"step": 91000
},
{
"epoch": 0.8898972217604318,
"eval_accuracy": 0.9991415238095238,
"eval_loss": 0.005721970461308956,
"eval_runtime": 53.7456,
"eval_samples_per_second": 558.185,
"eval_steps_per_second": 34.887,
"step": 91000
},
{
"epoch": 0.8947867669349397,
"grad_norm": 0.06095174327492714,
"learning_rate": 0.0004219483333333333,
"loss": 0.0095,
"step": 91500
},
{
"epoch": 0.8996763121094475,
"grad_norm": 0.03404530510306358,
"learning_rate": 0.0004200038888888889,
"loss": 0.0091,
"step": 92000
},
{
"epoch": 0.8996763121094475,
"eval_accuracy": 0.9991448571428572,
"eval_loss": 0.0056047323159873486,
"eval_runtime": 53.6229,
"eval_samples_per_second": 559.463,
"eval_steps_per_second": 34.966,
"step": 92000
},
{
"epoch": 0.9045658572839554,
"grad_norm": 0.044711388647556305,
"learning_rate": 0.00041805944444444444,
"loss": 0.0094,
"step": 92500
},
{
"epoch": 0.9094554024584633,
"grad_norm": 0.025318428874015808,
"learning_rate": 0.000416115,
"loss": 0.0091,
"step": 93000
},
{
"epoch": 0.9094554024584633,
"eval_accuracy": 0.9991459047619048,
"eval_loss": 0.0056663015857338905,
"eval_runtime": 53.7217,
"eval_samples_per_second": 558.433,
"eval_steps_per_second": 34.902,
"step": 93000
},
{
"epoch": 0.9143449476329711,
"grad_norm": 0.09479326009750366,
"learning_rate": 0.0004141705555555556,
"loss": 0.0091,
"step": 93500
},
{
"epoch": 0.919234492807479,
"grad_norm": 0.04621125012636185,
"learning_rate": 0.00041222611111111116,
"loss": 0.0091,
"step": 94000
},
{
"epoch": 0.919234492807479,
"eval_accuracy": 0.9991637619047619,
"eval_loss": 0.005490881856530905,
"eval_runtime": 52.8914,
"eval_samples_per_second": 567.2,
"eval_steps_per_second": 35.45,
"step": 94000
},
{
"epoch": 0.924124037981987,
"grad_norm": 0.11758420616388321,
"learning_rate": 0.0004102816666666666,
"loss": 0.0091,
"step": 94500
},
{
"epoch": 0.9290135831564947,
"grad_norm": 0.048568353056907654,
"learning_rate": 0.00040833722222222217,
"loss": 0.0085,
"step": 95000
},
{
"epoch": 0.9290135831564947,
"eval_accuracy": 0.9991408571428572,
"eval_loss": 0.0056878020986914635,
"eval_runtime": 54.7817,
"eval_samples_per_second": 547.628,
"eval_steps_per_second": 34.227,
"step": 95000
},
{
"epoch": 0.9339031283310026,
"grad_norm": 0.12460034340620041,
"learning_rate": 0.0004063927777777778,
"loss": 0.0089,
"step": 95500
},
{
"epoch": 0.9387926735055105,
"grad_norm": 0.04623766988515854,
"learning_rate": 0.00040444833333333334,
"loss": 0.0087,
"step": 96000
},
{
"epoch": 0.9387926735055105,
"eval_accuracy": 0.9991676190476191,
"eval_loss": 0.005500451661646366,
"eval_runtime": 53.9981,
"eval_samples_per_second": 555.575,
"eval_steps_per_second": 34.723,
"step": 96000
},
{
"epoch": 0.9436822186800183,
"grad_norm": 0.08665420114994049,
"learning_rate": 0.0004025038888888889,
"loss": 0.0087,
"step": 96500
},
{
"epoch": 0.9485717638545262,
"grad_norm": 0.0452926941215992,
"learning_rate": 0.00040055944444444445,
"loss": 0.0084,
"step": 97000
},
{
"epoch": 0.9485717638545262,
"eval_accuracy": 0.999164,
"eval_loss": 0.005574519746005535,
"eval_runtime": 54.6981,
"eval_samples_per_second": 548.465,
"eval_steps_per_second": 34.279,
"step": 97000
},
{
"epoch": 0.9534613090290341,
"grad_norm": 0.03491511195898056,
"learning_rate": 0.000398615,
"loss": 0.0086,
"step": 97500
},
{
"epoch": 0.9583508542035419,
"grad_norm": 0.044573381543159485,
"learning_rate": 0.00039667055555555556,
"loss": 0.0089,
"step": 98000
},
{
"epoch": 0.9583508542035419,
"eval_accuracy": 0.9991894285714286,
"eval_loss": 0.005372173152863979,
"eval_runtime": 53.4094,
"eval_samples_per_second": 561.699,
"eval_steps_per_second": 35.106,
"step": 98000
},
{
"epoch": 0.9632403993780498,
"grad_norm": 0.02608780935406685,
"learning_rate": 0.0003947261111111111,
"loss": 0.0086,
"step": 98500
},
{
"epoch": 0.9681299445525577,
"grad_norm": 0.04312971234321594,
"learning_rate": 0.0003927816666666667,
"loss": 0.0086,
"step": 99000
},
{
"epoch": 0.9681299445525577,
"eval_accuracy": 0.9991722380952381,
"eval_loss": 0.0054678237065672874,
"eval_runtime": 54.0015,
"eval_samples_per_second": 555.541,
"eval_steps_per_second": 34.721,
"step": 99000
},
{
"epoch": 0.9730194897270655,
"grad_norm": 0.06294015049934387,
"learning_rate": 0.00039083722222222223,
"loss": 0.0085,
"step": 99500
},
{
"epoch": 0.9779090349015734,
"grad_norm": 0.029000315815210342,
"learning_rate": 0.00038889277777777773,
"loss": 0.0087,
"step": 100000
},
{
"epoch": 0.9779090349015734,
"eval_accuracy": 0.999185380952381,
"eval_loss": 0.005396171938627958,
"eval_runtime": 55.6579,
"eval_samples_per_second": 539.007,
"eval_steps_per_second": 33.688,
"step": 100000
},
{
"epoch": 0.9827985800760813,
"grad_norm": 0.04323006793856621,
"learning_rate": 0.0003869483333333333,
"loss": 0.0087,
"step": 100500
},
{
"epoch": 0.9876881252505892,
"grad_norm": 0.0731167271733284,
"learning_rate": 0.00038500388888888885,
"loss": 0.0081,
"step": 101000
},
{
"epoch": 0.9876881252505892,
"eval_accuracy": 0.9991765238095238,
"eval_loss": 0.005412892438471317,
"eval_runtime": 55.9769,
"eval_samples_per_second": 535.935,
"eval_steps_per_second": 33.496,
"step": 101000
},
{
"epoch": 0.992577670425097,
"grad_norm": 0.023585299029946327,
"learning_rate": 0.0003830594444444444,
"loss": 0.0088,
"step": 101500
},
{
"epoch": 0.9974672155996049,
"grad_norm": 0.08938384801149368,
"learning_rate": 0.000381115,
"loss": 0.0086,
"step": 102000
},
{
"epoch": 0.9974672155996049,
"eval_accuracy": 0.9991979047619047,
"eval_loss": 0.005323469173163176,
"eval_runtime": 53.2851,
"eval_samples_per_second": 563.009,
"eval_steps_per_second": 35.188,
"step": 102000
},
{
"epoch": 1.0023567607741128,
"grad_norm": 0.038682036101818085,
"learning_rate": 0.00037917055555555557,
"loss": 0.0082,
"step": 102500
},
{
"epoch": 1.0072463059486207,
"grad_norm": 0.07080361992120743,
"learning_rate": 0.0003772261111111111,
"loss": 0.0081,
"step": 103000
},
{
"epoch": 1.0072463059486207,
"eval_accuracy": 0.9992074761904762,
"eval_loss": 0.00541540514677763,
"eval_runtime": 54.1542,
"eval_samples_per_second": 553.974,
"eval_steps_per_second": 34.623,
"step": 103000
},
{
"epoch": 1.0121358511231284,
"grad_norm": 0.0545232892036438,
"learning_rate": 0.0003752816666666667,
"loss": 0.0079,
"step": 103500
},
{
"epoch": 1.0170253962976363,
"grad_norm": 0.05419744551181793,
"learning_rate": 0.00037333722222222224,
"loss": 0.0083,
"step": 104000
},
{
"epoch": 1.0170253962976363,
"eval_accuracy": 0.999227380952381,
"eval_loss": 0.005181997548788786,
"eval_runtime": 54.7563,
"eval_samples_per_second": 547.882,
"eval_steps_per_second": 34.243,
"step": 104000
},
{
"epoch": 1.0219149414721442,
"grad_norm": 0.062064480036497116,
"learning_rate": 0.0003713927777777778,
"loss": 0.0078,
"step": 104500
},
{
"epoch": 1.0268044866466521,
"grad_norm": 0.0431884303689003,
"learning_rate": 0.00036944833333333335,
"loss": 0.0078,
"step": 105000
},
{
"epoch": 1.0268044866466521,
"eval_accuracy": 0.999227380952381,
"eval_loss": 0.005218331702053547,
"eval_runtime": 53.5479,
"eval_samples_per_second": 560.246,
"eval_steps_per_second": 35.015,
"step": 105000
},
{
"epoch": 1.03169403182116,
"grad_norm": 0.035419270396232605,
"learning_rate": 0.00036750388888888885,
"loss": 0.0079,
"step": 105500
},
{
"epoch": 1.036583576995668,
"grad_norm": 0.03565732017159462,
"learning_rate": 0.0003655594444444444,
"loss": 0.0078,
"step": 106000
},
{
"epoch": 1.036583576995668,
"eval_accuracy": 0.9992299523809524,
"eval_loss": 0.005135852377861738,
"eval_runtime": 54.0577,
"eval_samples_per_second": 554.962,
"eval_steps_per_second": 34.685,
"step": 106000
},
{
"epoch": 1.0414731221701756,
"grad_norm": 0.04575124382972717,
"learning_rate": 0.00036361499999999997,
"loss": 0.0076,
"step": 106500
},
{
"epoch": 1.0463626673446835,
"grad_norm": 0.07697087526321411,
"learning_rate": 0.0003616705555555555,
"loss": 0.0076,
"step": 107000
},
{
"epoch": 1.0463626673446835,
"eval_accuracy": 0.9992333809523809,
"eval_loss": 0.005050502717494965,
"eval_runtime": 53.4533,
"eval_samples_per_second": 561.238,
"eval_steps_per_second": 35.077,
"step": 107000
},
{
"epoch": 1.0512522125191914,
"grad_norm": 0.05499347671866417,
"learning_rate": 0.0003597261111111111,
"loss": 0.0079,
"step": 107500
},
{
"epoch": 1.0561417576936993,
"grad_norm": 0.035594772547483444,
"learning_rate": 0.00035778166666666664,
"loss": 0.0081,
"step": 108000
},
{
"epoch": 1.0561417576936993,
"eval_accuracy": 0.9992301428571428,
"eval_loss": 0.0050900341011583805,
"eval_runtime": 53.2622,
"eval_samples_per_second": 563.251,
"eval_steps_per_second": 35.203,
"step": 108000
},
{
"epoch": 1.0610313028682072,
"grad_norm": 0.020569855347275734,
"learning_rate": 0.00035583722222222225,
"loss": 0.0077,
"step": 108500
},
{
"epoch": 1.0659208480427151,
"grad_norm": 0.06758717447519302,
"learning_rate": 0.0003538927777777778,
"loss": 0.0082,
"step": 109000
},
{
"epoch": 1.0659208480427151,
"eval_accuracy": 0.9992373333333333,
"eval_loss": 0.005076898727566004,
"eval_runtime": 53.4707,
"eval_samples_per_second": 561.054,
"eval_steps_per_second": 35.066,
"step": 109000
},
{
"epoch": 1.070810393217223,
"grad_norm": 0.04208175465464592,
"learning_rate": 0.00035194833333333336,
"loss": 0.0079,
"step": 109500
},
{
"epoch": 1.0756999383917307,
"grad_norm": 0.040982868522405624,
"learning_rate": 0.0003500038888888889,
"loss": 0.0074,
"step": 110000
},
{
"epoch": 1.0756999383917307,
"eval_accuracy": 0.9992489523809523,
"eval_loss": 0.00500760693103075,
"eval_runtime": 54.1302,
"eval_samples_per_second": 554.219,
"eval_steps_per_second": 34.639,
"step": 110000
},
{
"epoch": 1.0805894835662386,
"grad_norm": 0.05090247467160225,
"learning_rate": 0.0003480594444444444,
"loss": 0.0075,
"step": 110500
},
{
"epoch": 1.0854790287407465,
"grad_norm": 0.02564290165901184,
"learning_rate": 0.000346115,
"loss": 0.0077,
"step": 111000
},
{
"epoch": 1.0854790287407465,
"eval_accuracy": 0.9992412380952381,
"eval_loss": 0.005068215075880289,
"eval_runtime": 53.2721,
"eval_samples_per_second": 563.147,
"eval_steps_per_second": 35.197,
"step": 111000
},
{
"epoch": 1.0903685739152544,
"grad_norm": 0.032404959201812744,
"learning_rate": 0.0003441705555555556,
"loss": 0.0076,
"step": 111500
},
{
"epoch": 1.0952581190897623,
"grad_norm": 0.05177515000104904,
"learning_rate": 0.00034222611111111114,
"loss": 0.0077,
"step": 112000
},
{
"epoch": 1.0952581190897623,
"eval_accuracy": 0.9992587142857143,
"eval_loss": 0.00494408467784524,
"eval_runtime": 53.7598,
"eval_samples_per_second": 558.038,
"eval_steps_per_second": 34.877,
"step": 112000
},
{
"epoch": 1.10014766426427,
"grad_norm": 0.041296541690826416,
"learning_rate": 0.00034028166666666664,
"loss": 0.0076,
"step": 112500
},
{
"epoch": 1.105037209438778,
"grad_norm": 0.027352752164006233,
"learning_rate": 0.0003383372222222222,
"loss": 0.0077,
"step": 113000
},
{
"epoch": 1.105037209438778,
"eval_accuracy": 0.9992613333333333,
"eval_loss": 0.004911018069833517,
"eval_runtime": 53.361,
"eval_samples_per_second": 562.209,
"eval_steps_per_second": 35.138,
"step": 113000
},
{
"epoch": 1.1099267546132858,
"grad_norm": 0.017891952767968178,
"learning_rate": 0.00033639277777777776,
"loss": 0.0074,
"step": 113500
},
{
"epoch": 1.1148162997877937,
"grad_norm": 0.10825661569833755,
"learning_rate": 0.0003344483333333333,
"loss": 0.0077,
"step": 114000
},
{
"epoch": 1.1148162997877937,
"eval_accuracy": 0.9992698095238095,
"eval_loss": 0.004937721882015467,
"eval_runtime": 53.9545,
"eval_samples_per_second": 556.024,
"eval_steps_per_second": 34.752,
"step": 114000
},
{
"epoch": 1.1197058449623016,
"grad_norm": 0.0252179317176342,
"learning_rate": 0.00033250388888888887,
"loss": 0.0072,
"step": 114500
},
{
"epoch": 1.1245953901368095,
"grad_norm": 0.10007605701684952,
"learning_rate": 0.0003305594444444445,
"loss": 0.0073,
"step": 115000
},
{
"epoch": 1.1245953901368095,
"eval_accuracy": 0.9992664285714286,
"eval_loss": 0.005000779405236244,
"eval_runtime": 53.4444,
"eval_samples_per_second": 561.331,
"eval_steps_per_second": 35.083,
"step": 115000
},
{
"epoch": 1.1294849353113174,
"grad_norm": 0.08812825381755829,
"learning_rate": 0.000328615,
"loss": 0.0076,
"step": 115500
},
{
"epoch": 1.1343744804858251,
"grad_norm": 0.04212397709488869,
"learning_rate": 0.00032667055555555554,
"loss": 0.0071,
"step": 116000
},
{
"epoch": 1.1343744804858251,
"eval_accuracy": 0.9992689523809524,
"eval_loss": 0.0048895059153437614,
"eval_runtime": 56.3714,
"eval_samples_per_second": 532.185,
"eval_steps_per_second": 33.262,
"step": 116000
},
{
"epoch": 1.139264025660333,
"grad_norm": 0.02763226442039013,
"learning_rate": 0.0003247261111111111,
"loss": 0.0075,
"step": 116500
},
{
"epoch": 1.144153570834841,
"grad_norm": 0.05487339198589325,
"learning_rate": 0.00032278166666666665,
"loss": 0.0074,
"step": 117000
},
{
"epoch": 1.144153570834841,
"eval_accuracy": 0.9992661428571429,
"eval_loss": 0.004837568383663893,
"eval_runtime": 54.3925,
"eval_samples_per_second": 551.547,
"eval_steps_per_second": 34.472,
"step": 117000
},
{
"epoch": 1.1490431160093488,
"grad_norm": 0.04747488722205162,
"learning_rate": 0.0003208372222222222,
"loss": 0.0075,
"step": 117500
},
{
"epoch": 1.1539326611838567,
"grad_norm": 0.10006921738386154,
"learning_rate": 0.00031889277777777777,
"loss": 0.0074,
"step": 118000
},
{
"epoch": 1.1539326611838567,
"eval_accuracy": 0.9992860476190476,
"eval_loss": 0.0047850459814071655,
"eval_runtime": 53.7241,
"eval_samples_per_second": 558.408,
"eval_steps_per_second": 34.901,
"step": 118000
},
{
"epoch": 1.1588222063583646,
"grad_norm": 0.03712115064263344,
"learning_rate": 0.0003169483333333333,
"loss": 0.0075,
"step": 118500
},
{
"epoch": 1.1637117515328723,
"grad_norm": 0.05919933691620827,
"learning_rate": 0.0003150038888888889,
"loss": 0.0073,
"step": 119000
},
{
"epoch": 1.1637117515328723,
"eval_accuracy": 0.9992771428571429,
"eval_loss": 0.004803878720849752,
"eval_runtime": 53.7517,
"eval_samples_per_second": 558.121,
"eval_steps_per_second": 34.883,
"step": 119000
},
{
"epoch": 1.1686012967073802,
"grad_norm": 0.017905965447425842,
"learning_rate": 0.00031305944444444444,
"loss": 0.0069,
"step": 119500
},
{
"epoch": 1.1734908418818881,
"grad_norm": 0.05728234723210335,
"learning_rate": 0.000311115,
"loss": 0.007,
"step": 120000
},
{
"epoch": 1.1734908418818881,
"eval_accuracy": 0.999289,
"eval_loss": 0.004755858797580004,
"eval_runtime": 53.6273,
"eval_samples_per_second": 559.417,
"eval_steps_per_second": 34.964,
"step": 120000
},
{
"epoch": 1.178380387056396,
"grad_norm": 0.05677701532840729,
"learning_rate": 0.00030917055555555555,
"loss": 0.007,
"step": 120500
},
{
"epoch": 1.183269932230904,
"grad_norm": 0.05953844264149666,
"learning_rate": 0.0003072261111111111,
"loss": 0.0071,
"step": 121000
},
{
"epoch": 1.183269932230904,
"eval_accuracy": 0.999293619047619,
"eval_loss": 0.004746082704514265,
"eval_runtime": 55.1206,
"eval_samples_per_second": 544.262,
"eval_steps_per_second": 34.016,
"step": 121000
},
{
"epoch": 1.1881594774054118,
"grad_norm": 0.03433966636657715,
"learning_rate": 0.00030528166666666666,
"loss": 0.0071,
"step": 121500
},
{
"epoch": 1.1930490225799195,
"grad_norm": 0.0718400701880455,
"learning_rate": 0.0003033372222222222,
"loss": 0.0073,
"step": 122000
},
{
"epoch": 1.1930490225799195,
"eval_accuracy": 0.9992973333333334,
"eval_loss": 0.004623962566256523,
"eval_runtime": 54.751,
"eval_samples_per_second": 547.935,
"eval_steps_per_second": 34.246,
"step": 122000
},
{
"epoch": 1.1979385677544274,
"grad_norm": 0.026871928945183754,
"learning_rate": 0.0003013927777777778,
"loss": 0.0065,
"step": 122500
},
{
"epoch": 1.2028281129289353,
"grad_norm": 0.015808627009391785,
"learning_rate": 0.00029944833333333333,
"loss": 0.0069,
"step": 123000
},
{
"epoch": 1.2028281129289353,
"eval_accuracy": 0.9992959047619048,
"eval_loss": 0.004734317306429148,
"eval_runtime": 53.9604,
"eval_samples_per_second": 555.963,
"eval_steps_per_second": 34.748,
"step": 123000
},
{
"epoch": 1.2077176581034432,
"grad_norm": 0.06739887595176697,
"learning_rate": 0.0002975038888888889,
"loss": 0.0071,
"step": 123500
},
{
"epoch": 1.2126072032779511,
"grad_norm": 0.020941952243447304,
"learning_rate": 0.00029555944444444444,
"loss": 0.007,
"step": 124000
},
{
"epoch": 1.2126072032779511,
"eval_accuracy": 0.9992935238095239,
"eval_loss": 0.004609288647770882,
"eval_runtime": 54.1194,
"eval_samples_per_second": 554.33,
"eval_steps_per_second": 34.646,
"step": 124000
},
{
"epoch": 1.217496748452459,
"grad_norm": 0.027827920392155647,
"learning_rate": 0.000293615,
"loss": 0.007,
"step": 124500
},
{
"epoch": 1.222386293626967,
"grad_norm": 0.08693556487560272,
"learning_rate": 0.00029167055555555556,
"loss": 0.0069,
"step": 125000
},
{
"epoch": 1.222386293626967,
"eval_accuracy": 0.9993093333333334,
"eval_loss": 0.004602524451911449,
"eval_runtime": 53.3938,
"eval_samples_per_second": 561.863,
"eval_steps_per_second": 35.116,
"step": 125000
},
{
"epoch": 1.2272758388014746,
"grad_norm": 0.04795575141906738,
"learning_rate": 0.0002897261111111111,
"loss": 0.0069,
"step": 125500
},
{
"epoch": 1.2321653839759825,
"grad_norm": 0.07266402244567871,
"learning_rate": 0.00028778166666666667,
"loss": 0.0071,
"step": 126000
},
{
"epoch": 1.2321653839759825,
"eval_accuracy": 0.9993089523809524,
"eval_loss": 0.00456634908914566,
"eval_runtime": 54.0494,
"eval_samples_per_second": 555.048,
"eval_steps_per_second": 34.69,
"step": 126000
},
{
"epoch": 1.2370549291504904,
"grad_norm": 0.03289886936545372,
"learning_rate": 0.0002858372222222222,
"loss": 0.0072,
"step": 126500
},
{
"epoch": 1.2419444743249983,
"grad_norm": 0.02240580879151821,
"learning_rate": 0.0002838927777777778,
"loss": 0.007,
"step": 127000
},
{
"epoch": 1.2419444743249983,
"eval_accuracy": 0.9993215714285715,
"eval_loss": 0.004485046491026878,
"eval_runtime": 53.4392,
"eval_samples_per_second": 561.386,
"eval_steps_per_second": 35.087,
"step": 127000
},
{
"epoch": 1.2468340194995062,
"grad_norm": 0.040360696613788605,
"learning_rate": 0.00028194833333333334,
"loss": 0.0068,
"step": 127500
},
{
"epoch": 1.251723564674014,
"grad_norm": 0.032697584480047226,
"learning_rate": 0.0002800038888888889,
"loss": 0.0072,
"step": 128000
},
{
"epoch": 1.251723564674014,
"eval_accuracy": 0.9993274761904762,
"eval_loss": 0.004469048231840134,
"eval_runtime": 53.9627,
"eval_samples_per_second": 555.939,
"eval_steps_per_second": 34.746,
"step": 128000
},
{
"epoch": 1.2566131098485218,
"grad_norm": 0.021058347076177597,
"learning_rate": 0.00027805944444444445,
"loss": 0.0069,
"step": 128500
},
{
"epoch": 1.2615026550230297,
"grad_norm": 0.036056675016880035,
"learning_rate": 0.000276115,
"loss": 0.0067,
"step": 129000
},
{
"epoch": 1.2615026550230297,
"eval_accuracy": 0.9993329047619047,
"eval_loss": 0.004397740587592125,
"eval_runtime": 53.2747,
"eval_samples_per_second": 563.12,
"eval_steps_per_second": 35.195,
"step": 129000
},
{
"epoch": 1.2663922001975376,
"grad_norm": 0.034787457436323166,
"learning_rate": 0.0002741705555555555,
"loss": 0.0066,
"step": 129500
},
{
"epoch": 1.2712817453720455,
"grad_norm": 0.05359942466020584,
"learning_rate": 0.0002722261111111111,
"loss": 0.0065,
"step": 130000
},
{
"epoch": 1.2712817453720455,
"eval_accuracy": 0.9993344761904762,
"eval_loss": 0.004399556666612625,
"eval_runtime": 53.9523,
"eval_samples_per_second": 556.047,
"eval_steps_per_second": 34.753,
"step": 130000
},
{
"epoch": 1.2761712905465534,
"grad_norm": 0.02243073098361492,
"learning_rate": 0.0002702816666666667,
"loss": 0.0068,
"step": 130500
},
{
"epoch": 1.2810608357210613,
"grad_norm": 0.049295682460069656,
"learning_rate": 0.00026833722222222223,
"loss": 0.0068,
"step": 131000
},
{
"epoch": 1.2810608357210613,
"eval_accuracy": 0.9993318571428571,
"eval_loss": 0.004440919030457735,
"eval_runtime": 53.2304,
"eval_samples_per_second": 563.587,
"eval_steps_per_second": 35.224,
"step": 131000
},
{
"epoch": 1.285950380895569,
"grad_norm": 0.021682027727365494,
"learning_rate": 0.0002663927777777778,
"loss": 0.0067,
"step": 131500
},
{
"epoch": 1.290839926070077,
"grad_norm": 0.0382467582821846,
"learning_rate": 0.00026444833333333335,
"loss": 0.0067,
"step": 132000
},
{
"epoch": 1.290839926070077,
"eval_accuracy": 0.9993491904761905,
"eval_loss": 0.004402833059430122,
"eval_runtime": 53.8618,
"eval_samples_per_second": 556.981,
"eval_steps_per_second": 34.811,
"step": 132000
},
{
"epoch": 1.2957294712445848,
"grad_norm": 0.041405659168958664,
"learning_rate": 0.00026250388888888885,
"loss": 0.0068,
"step": 132500
},
{
"epoch": 1.3006190164190927,
"grad_norm": 0.039939701557159424,
"learning_rate": 0.00026055944444444446,
"loss": 0.0064,
"step": 133000
},
{
"epoch": 1.3006190164190927,
"eval_accuracy": 0.9993461904761904,
"eval_loss": 0.004411030560731888,
"eval_runtime": 52.9835,
"eval_samples_per_second": 566.214,
"eval_steps_per_second": 35.388,
"step": 133000
},
{
"epoch": 1.3055085615936006,
"grad_norm": 0.07499232143163681,
"learning_rate": 0.000258615,
"loss": 0.0068,
"step": 133500
},
{
"epoch": 1.3103981067681083,
"grad_norm": 0.03830355405807495,
"learning_rate": 0.0002566705555555556,
"loss": 0.0066,
"step": 134000
},
{
"epoch": 1.3103981067681083,
"eval_accuracy": 0.9993475238095239,
"eval_loss": 0.004307963885366917,
"eval_runtime": 54.0847,
"eval_samples_per_second": 554.685,
"eval_steps_per_second": 34.668,
"step": 134000
},
{
"epoch": 1.3152876519426164,
"grad_norm": 0.04341171681880951,
"learning_rate": 0.00025472611111111113,
"loss": 0.0064,
"step": 134500
},
{
"epoch": 1.3201771971171241,
"grad_norm": 0.05085453763604164,
"learning_rate": 0.00025278166666666663,
"loss": 0.0066,
"step": 135000
},
{
"epoch": 1.3201771971171241,
"eval_accuracy": 0.9993423809523809,
"eval_loss": 0.004391905851662159,
"eval_runtime": 53.489,
"eval_samples_per_second": 560.863,
"eval_steps_per_second": 35.054,
"step": 135000
},
{
"epoch": 1.325066742291632,
"grad_norm": 0.05465886369347572,
"learning_rate": 0.0002508372222222222,
"loss": 0.0065,
"step": 135500
},
{
"epoch": 1.32995628746614,
"grad_norm": 0.028779752552509308,
"learning_rate": 0.00024889277777777774,
"loss": 0.0065,
"step": 136000
},
{
"epoch": 1.32995628746614,
"eval_accuracy": 0.9993518571428571,
"eval_loss": 0.004291407763957977,
"eval_runtime": 53.5568,
"eval_samples_per_second": 560.153,
"eval_steps_per_second": 35.01,
"step": 136000
},
{
"epoch": 1.3348458326406478,
"grad_norm": 0.07813508808612823,
"learning_rate": 0.00024694833333333336,
"loss": 0.0069,
"step": 136500
},
{
"epoch": 1.3397353778151557,
"grad_norm": 0.034233298152685165,
"learning_rate": 0.0002450038888888889,
"loss": 0.0064,
"step": 137000
},
{
"epoch": 1.3397353778151557,
"eval_accuracy": 0.9993458095238095,
"eval_loss": 0.004360624123364687,
"eval_runtime": 52.8603,
"eval_samples_per_second": 567.534,
"eval_steps_per_second": 35.471,
"step": 137000
},
{
"epoch": 1.3446249229896634,
"grad_norm": 0.08024276047945023,
"learning_rate": 0.00024305944444444447,
"loss": 0.0061,
"step": 137500
},
{
"epoch": 1.3495144681641713,
"grad_norm": 0.05493255332112312,
"learning_rate": 0.00024111499999999997,
"loss": 0.0066,
"step": 138000
},
{
"epoch": 1.3495144681641713,
"eval_accuracy": 0.9993639047619047,
"eval_loss": 0.00431590573862195,
"eval_runtime": 53.7077,
"eval_samples_per_second": 558.579,
"eval_steps_per_second": 34.911,
"step": 138000
},
{
"epoch": 1.3544040133386792,
"grad_norm": 0.04275180399417877,
"learning_rate": 0.00023917055555555555,
"loss": 0.0062,
"step": 138500
},
{
"epoch": 1.3592935585131871,
"grad_norm": 0.07628139853477478,
"learning_rate": 0.0002372261111111111,
"loss": 0.0065,
"step": 139000
},
{
"epoch": 1.3592935585131871,
"eval_accuracy": 0.9993583809523809,
"eval_loss": 0.0042925444431602955,
"eval_runtime": 53.3087,
"eval_samples_per_second": 562.76,
"eval_steps_per_second": 35.173,
"step": 139000
},
{
"epoch": 1.364183103687695,
"grad_norm": 0.018862802535295486,
"learning_rate": 0.00023528166666666667,
"loss": 0.0064,
"step": 139500
},
{
"epoch": 1.3690726488622027,
"grad_norm": 0.059994716197252274,
"learning_rate": 0.00023333722222222222,
"loss": 0.0061,
"step": 140000
},
{
"epoch": 1.3690726488622027,
"eval_accuracy": 0.9993745714285714,
"eval_loss": 0.004216773435473442,
"eval_runtime": 53.7427,
"eval_samples_per_second": 558.215,
"eval_steps_per_second": 34.888,
"step": 140000
},
{
"epoch": 1.3739621940367108,
"grad_norm": 0.02738560363650322,
"learning_rate": 0.00023139277777777775,
"loss": 0.006,
"step": 140500
},
{
"epoch": 1.3788517392112185,
"grad_norm": 0.16879647970199585,
"learning_rate": 0.0002294483333333333,
"loss": 0.0062,
"step": 141000
},
{
"epoch": 1.3788517392112185,
"eval_accuracy": 0.9993692380952381,
"eval_loss": 0.004215199965983629,
"eval_runtime": 53.2674,
"eval_samples_per_second": 563.197,
"eval_steps_per_second": 35.2,
"step": 141000
},
{
"epoch": 1.3837412843857264,
"grad_norm": 0.03396091237664223,
"learning_rate": 0.0002275038888888889,
"loss": 0.0062,
"step": 141500
},
{
"epoch": 1.3886308295602343,
"grad_norm": 0.04174041002988815,
"learning_rate": 0.00022555944444444445,
"loss": 0.0063,
"step": 142000
},
{
"epoch": 1.3886308295602343,
"eval_accuracy": 0.9993620476190476,
"eval_loss": 0.00427864259108901,
"eval_runtime": 54.516,
"eval_samples_per_second": 550.297,
"eval_steps_per_second": 34.394,
"step": 142000
},
{
"epoch": 1.3935203747347422,
"grad_norm": 0.032653287053108215,
"learning_rate": 0.000223615,
"loss": 0.0062,
"step": 142500
},
{
"epoch": 1.3984099199092501,
"grad_norm": 0.04273010045289993,
"learning_rate": 0.00022167055555555556,
"loss": 0.0061,
"step": 143000
},
{
"epoch": 1.3984099199092501,
"eval_accuracy": 0.9993804761904762,
"eval_loss": 0.0041556586511433125,
"eval_runtime": 53.4491,
"eval_samples_per_second": 561.282,
"eval_steps_per_second": 35.08,
"step": 143000
},
{
"epoch": 1.4032994650837578,
"grad_norm": 0.043946944177150726,
"learning_rate": 0.0002197261111111111,
"loss": 0.0059,
"step": 143500
},
{
"epoch": 1.4081890102582657,
"grad_norm": 0.016042672097682953,
"learning_rate": 0.00021778166666666665,
"loss": 0.0062,
"step": 144000
},
{
"epoch": 1.4081890102582657,
"eval_accuracy": 0.9993822857142857,
"eval_loss": 0.004146920517086983,
"eval_runtime": 53.2095,
"eval_samples_per_second": 563.809,
"eval_steps_per_second": 35.238,
"step": 144000
},
{
"epoch": 1.4130785554327736,
"grad_norm": 0.04190443456172943,
"learning_rate": 0.0002158372222222222,
"loss": 0.006,
"step": 144500
},
{
"epoch": 1.4179681006072815,
"grad_norm": 0.029104501008987427,
"learning_rate": 0.0002138927777777778,
"loss": 0.006,
"step": 145000
},
{
"epoch": 1.4179681006072815,
"eval_accuracy": 0.9993911428571428,
"eval_loss": 0.004062490537762642,
"eval_runtime": 53.4832,
"eval_samples_per_second": 560.923,
"eval_steps_per_second": 35.058,
"step": 145000
},
{
"epoch": 1.4228576457817894,
"grad_norm": 0.019995709881186485,
"learning_rate": 0.00021194833333333335,
"loss": 0.0058,
"step": 145500
},
{
"epoch": 1.4277471909562973,
"grad_norm": 0.016850166022777557,
"learning_rate": 0.0002100038888888889,
"loss": 0.0062,
"step": 146000
},
{
"epoch": 1.4277471909562973,
"eval_accuracy": 0.9993850476190477,
"eval_loss": 0.00406758114695549,
"eval_runtime": 54.1149,
"eval_samples_per_second": 554.376,
"eval_steps_per_second": 34.648,
"step": 146000
},
{
"epoch": 1.4326367361308052,
"grad_norm": 0.042491696774959564,
"learning_rate": 0.00020805944444444443,
"loss": 0.0059,
"step": 146500
},
{
"epoch": 1.437526281305313,
"grad_norm": 0.07708732038736343,
"learning_rate": 0.000206115,
"loss": 0.006,
"step": 147000
},
{
"epoch": 1.437526281305313,
"eval_accuracy": 0.9993972857142858,
"eval_loss": 0.004030513111501932,
"eval_runtime": 53.221,
"eval_samples_per_second": 563.687,
"eval_steps_per_second": 35.23,
"step": 147000
},
{
"epoch": 1.4424158264798208,
"grad_norm": 0.032772552222013474,
"learning_rate": 0.00020417055555555554,
"loss": 0.0059,
"step": 147500
},
{
"epoch": 1.4473053716543287,
"grad_norm": 0.041167329996824265,
"learning_rate": 0.00020222611111111113,
"loss": 0.0058,
"step": 148000
},
{
"epoch": 1.4473053716543287,
"eval_accuracy": 0.999398,
"eval_loss": 0.004109182395040989,
"eval_runtime": 53.8747,
"eval_samples_per_second": 556.848,
"eval_steps_per_second": 34.803,
"step": 148000
},
{
"epoch": 1.4521949168288366,
"grad_norm": 0.033146705478429794,
"learning_rate": 0.00020028166666666668,
"loss": 0.0058,
"step": 148500
},
{
"epoch": 1.4570844620033445,
"grad_norm": 0.04614367336034775,
"learning_rate": 0.0001983372222222222,
"loss": 0.0057,
"step": 149000
},
{
"epoch": 1.4570844620033445,
"eval_accuracy": 0.9994065238095238,
"eval_loss": 0.003991841338574886,
"eval_runtime": 53.7363,
"eval_samples_per_second": 558.282,
"eval_steps_per_second": 34.893,
"step": 149000
},
{
"epoch": 1.4619740071778522,
"grad_norm": 0.031296566128730774,
"learning_rate": 0.00019639277777777777,
"loss": 0.0057,
"step": 149500
},
{
"epoch": 1.46686355235236,
"grad_norm": 0.03523857146501541,
"learning_rate": 0.00019444833333333333,
"loss": 0.0059,
"step": 150000
},
{
"epoch": 1.46686355235236,
"eval_accuracy": 0.9994045238095238,
"eval_loss": 0.00398767227306962,
"eval_runtime": 54.0668,
"eval_samples_per_second": 554.869,
"eval_steps_per_second": 34.679,
"step": 150000
},
{
"epoch": 1.471753097526868,
"grad_norm": 0.030513431876897812,
"learning_rate": 0.00019250388888888888,
"loss": 0.006,
"step": 150500
},
{
"epoch": 1.476642642701376,
"grad_norm": 0.03433874994516373,
"learning_rate": 0.00019055944444444444,
"loss": 0.0057,
"step": 151000
},
{
"epoch": 1.476642642701376,
"eval_accuracy": 0.999412,
"eval_loss": 0.003936768043786287,
"eval_runtime": 53.4197,
"eval_samples_per_second": 561.591,
"eval_steps_per_second": 35.099,
"step": 151000
},
{
"epoch": 1.4815321878758838,
"grad_norm": 0.03743559867143631,
"learning_rate": 0.00018861500000000002,
"loss": 0.0059,
"step": 151500
},
{
"epoch": 1.4864217330503917,
"grad_norm": 0.023772869259119034,
"learning_rate": 0.00018667055555555553,
"loss": 0.0056,
"step": 152000
},
{
"epoch": 1.4864217330503917,
"eval_accuracy": 0.9994103333333333,
"eval_loss": 0.00395695585757494,
"eval_runtime": 53.4862,
"eval_samples_per_second": 560.892,
"eval_steps_per_second": 35.056,
"step": 152000
},
{
"epoch": 1.4913112782248996,
"grad_norm": 0.021286042407155037,
"learning_rate": 0.0001847261111111111,
"loss": 0.0056,
"step": 152500
},
{
"epoch": 1.4962008233994073,
"grad_norm": 0.04487517103552818,
"learning_rate": 0.00018278166666666667,
"loss": 0.0059,
"step": 153000
},
{
"epoch": 1.4962008233994073,
"eval_accuracy": 0.9994135714285715,
"eval_loss": 0.0038883944507688284,
"eval_runtime": 53.7959,
"eval_samples_per_second": 557.663,
"eval_steps_per_second": 34.854,
"step": 153000
},
{
"epoch": 1.5010903685739152,
"grad_norm": 0.02229585126042366,
"learning_rate": 0.00018083722222222222,
"loss": 0.0056,
"step": 153500
},
{
"epoch": 1.505979913748423,
"grad_norm": 0.06015641614794731,
"learning_rate": 0.00017889277777777778,
"loss": 0.0055,
"step": 154000
},
{
"epoch": 1.505979913748423,
"eval_accuracy": 0.9994171428571429,
"eval_loss": 0.0039031950291246176,
"eval_runtime": 53.8206,
"eval_samples_per_second": 557.408,
"eval_steps_per_second": 34.838,
"step": 154000
},
{
"epoch": 1.510869458922931,
"grad_norm": 0.060777414590120316,
"learning_rate": 0.00017694833333333336,
"loss": 0.0057,
"step": 154500
},
{
"epoch": 1.515759004097439,
"grad_norm": 0.010729908011853695,
"learning_rate": 0.00017500388888888886,
"loss": 0.0055,
"step": 155000
},
{
"epoch": 1.515759004097439,
"eval_accuracy": 0.9994168095238095,
"eval_loss": 0.0038592983037233353,
"eval_runtime": 52.9997,
"eval_samples_per_second": 566.041,
"eval_steps_per_second": 35.378,
"step": 155000
},
{
"epoch": 1.5206485492719466,
"grad_norm": 0.07996519654989243,
"learning_rate": 0.00017305944444444445,
"loss": 0.0056,
"step": 155500
},
{
"epoch": 1.5255380944464547,
"grad_norm": 0.05094398185610771,
"learning_rate": 0.000171115,
"loss": 0.0056,
"step": 156000
},
{
"epoch": 1.5255380944464547,
"eval_accuracy": 0.9994315238095238,
"eval_loss": 0.0037978454492986202,
"eval_runtime": 53.5723,
"eval_samples_per_second": 559.991,
"eval_steps_per_second": 34.999,
"step": 156000
},
{
"epoch": 1.5304276396209624,
"grad_norm": 0.038200926035642624,
"learning_rate": 0.00016917055555555556,
"loss": 0.0055,
"step": 156500
},
{
"epoch": 1.5353171847954703,
"grad_norm": 0.10346455127000809,
"learning_rate": 0.00016722611111111112,
"loss": 0.0054,
"step": 157000
},
{
"epoch": 1.5353171847954703,
"eval_accuracy": 0.9994299523809523,
"eval_loss": 0.0037865168415009975,
"eval_runtime": 53.2357,
"eval_samples_per_second": 563.531,
"eval_steps_per_second": 35.221,
"step": 157000
},
{
"epoch": 1.5402067299699782,
"grad_norm": 0.015595887787640095,
"learning_rate": 0.00016528166666666667,
"loss": 0.0056,
"step": 157500
},
{
"epoch": 1.545096275144486,
"grad_norm": 0.0232669860124588,
"learning_rate": 0.00016333722222222223,
"loss": 0.0055,
"step": 158000
},
{
"epoch": 1.545096275144486,
"eval_accuracy": 0.9994310476190477,
"eval_loss": 0.003748950082808733,
"eval_runtime": 54.3134,
"eval_samples_per_second": 552.35,
"eval_steps_per_second": 34.522,
"step": 158000
},
{
"epoch": 1.549985820318994,
"grad_norm": 0.04196183383464813,
"learning_rate": 0.00016139277777777776,
"loss": 0.0054,
"step": 158500
},
{
"epoch": 1.5548753654935017,
"grad_norm": 0.04280064254999161,
"learning_rate": 0.00015944833333333334,
"loss": 0.0055,
"step": 159000
},
{
"epoch": 1.5548753654935017,
"eval_accuracy": 0.9994327619047619,
"eval_loss": 0.00377083383500576,
"eval_runtime": 53.1652,
"eval_samples_per_second": 564.278,
"eval_steps_per_second": 35.267,
"step": 159000
},
{
"epoch": 1.5597649106680098,
"grad_norm": 0.01646304689347744,
"learning_rate": 0.00015750388888888887,
"loss": 0.0053,
"step": 159500
},
{
"epoch": 1.5646544558425175,
"grad_norm": 0.015490056946873665,
"learning_rate": 0.00015555944444444443,
"loss": 0.0053,
"step": 160000
},
{
"epoch": 1.5646544558425175,
"eval_accuracy": 0.9994344285714286,
"eval_loss": 0.0037254535127431154,
"eval_runtime": 55.4441,
"eval_samples_per_second": 541.086,
"eval_steps_per_second": 33.818,
"step": 160000
},
{
"epoch": 1.5695440010170254,
"grad_norm": 0.034573186188936234,
"learning_rate": 0.000153615,
"loss": 0.0052,
"step": 160500
},
{
"epoch": 1.5744335461915333,
"grad_norm": 0.0471004843711853,
"learning_rate": 0.00015167055555555554,
"loss": 0.0055,
"step": 161000
},
{
"epoch": 1.5744335461915333,
"eval_accuracy": 0.9994374285714286,
"eval_loss": 0.003749826457351446,
"eval_runtime": 52.9976,
"eval_samples_per_second": 566.063,
"eval_steps_per_second": 35.379,
"step": 161000
},
{
"epoch": 1.579323091366041,
"grad_norm": 0.06533846259117126,
"learning_rate": 0.0001497261111111111,
"loss": 0.0056,
"step": 161500
},
{
"epoch": 1.5842126365405491,
"grad_norm": 0.009449661709368229,
"learning_rate": 0.00014778166666666668,
"loss": 0.0053,
"step": 162000
},
{
"epoch": 1.5842126365405491,
"eval_accuracy": 0.9994476666666666,
"eval_loss": 0.003748701885342598,
"eval_runtime": 53.6491,
"eval_samples_per_second": 559.189,
"eval_steps_per_second": 34.949,
"step": 162000
},
{
"epoch": 1.5891021817150568,
"grad_norm": 0.009880056604743004,
"learning_rate": 0.0001458372222222222,
"loss": 0.0055,
"step": 162500
},
{
"epoch": 1.5939917268895647,
"grad_norm": 0.05580669641494751,
"learning_rate": 0.00014389277777777777,
"loss": 0.0051,
"step": 163000
},
{
"epoch": 1.5939917268895647,
"eval_accuracy": 0.9994498571428572,
"eval_loss": 0.0037052214611321688,
"eval_runtime": 53.1475,
"eval_samples_per_second": 564.467,
"eval_steps_per_second": 35.279,
"step": 163000
},
{
"epoch": 1.5988812720640726,
"grad_norm": 0.033147793263196945,
"learning_rate": 0.00014194833333333335,
"loss": 0.0055,
"step": 163500
},
{
"epoch": 1.6037708172385805,
"grad_norm": 0.04852864146232605,
"learning_rate": 0.00014000388888888888,
"loss": 0.0054,
"step": 164000
},
{
"epoch": 1.6037708172385805,
"eval_accuracy": 0.9994494761904762,
"eval_loss": 0.003642507828772068,
"eval_runtime": 53.3294,
"eval_samples_per_second": 562.542,
"eval_steps_per_second": 35.159,
"step": 164000
},
{
"epoch": 1.6086603624130884,
"grad_norm": 0.04461289569735527,
"learning_rate": 0.00013805944444444444,
"loss": 0.0053,
"step": 164500
},
{
"epoch": 1.613549907587596,
"grad_norm": 0.04816494509577751,
"learning_rate": 0.000136115,
"loss": 0.0053,
"step": 165000
},
{
"epoch": 1.613549907587596,
"eval_accuracy": 0.999451380952381,
"eval_loss": 0.003629567800089717,
"eval_runtime": 53.0304,
"eval_samples_per_second": 565.713,
"eval_steps_per_second": 35.357,
"step": 165000
},
{
"epoch": 1.6184394527621042,
"grad_norm": 0.04067426174879074,
"learning_rate": 0.00013417055555555555,
"loss": 0.0063,
"step": 165500
},
{
"epoch": 1.623328997936612,
"grad_norm": 0.040210772305727005,
"learning_rate": 0.0001322261111111111,
"loss": 0.0053,
"step": 166000
},
{
"epoch": 1.623328997936612,
"eval_accuracy": 0.9994541904761904,
"eval_loss": 0.0036138601135462523,
"eval_runtime": 53.4406,
"eval_samples_per_second": 561.371,
"eval_steps_per_second": 35.086,
"step": 166000
},
{
"epoch": 1.6282185431111198,
"grad_norm": 0.04125046357512474,
"learning_rate": 0.00013028166666666666,
"loss": 0.0053,
"step": 166500
},
{
"epoch": 1.6331080882856277,
"grad_norm": 0.03415411710739136,
"learning_rate": 0.00012833722222222222,
"loss": 0.0051,
"step": 167000
},
{
"epoch": 1.6331080882856277,
"eval_accuracy": 0.9994632380952381,
"eval_loss": 0.003615338122472167,
"eval_runtime": 53.1392,
"eval_samples_per_second": 564.555,
"eval_steps_per_second": 35.285,
"step": 167000
},
{
"epoch": 1.6379976334601354,
"grad_norm": 0.03695495426654816,
"learning_rate": 0.00012639277777777778,
"loss": 0.0053,
"step": 167500
},
{
"epoch": 1.6428871786346435,
"grad_norm": 0.011762870475649834,
"learning_rate": 0.00012444833333333333,
"loss": 0.0051,
"step": 168000
},
{
"epoch": 1.6428871786346435,
"eval_accuracy": 0.9994638095238095,
"eval_loss": 0.003587596118450165,
"eval_runtime": 53.5347,
"eval_samples_per_second": 560.384,
"eval_steps_per_second": 35.024,
"step": 168000
},
{
"epoch": 1.6477767238091512,
"grad_norm": 0.01232131477445364,
"learning_rate": 0.0001225038888888889,
"loss": 0.0048,
"step": 168500
},
{
"epoch": 1.652666268983659,
"grad_norm": 0.04049614071846008,
"learning_rate": 0.00012055944444444445,
"loss": 0.0048,
"step": 169000
},
{
"epoch": 1.652666268983659,
"eval_accuracy": 0.9994665714285714,
"eval_loss": 0.003581820521503687,
"eval_runtime": 53.0217,
"eval_samples_per_second": 565.806,
"eval_steps_per_second": 35.363,
"step": 169000
},
{
"epoch": 1.657555814158167,
"grad_norm": 0.04034195467829704,
"learning_rate": 0.00011861499999999999,
"loss": 0.0051,
"step": 169500
},
{
"epoch": 1.662445359332675,
"grad_norm": 0.014481657184660435,
"learning_rate": 0.00011667055555555556,
"loss": 0.0051,
"step": 170000
},
{
"epoch": 1.662445359332675,
"eval_accuracy": 0.9994720476190476,
"eval_loss": 0.0035638269037008286,
"eval_runtime": 54.3974,
"eval_samples_per_second": 551.497,
"eval_steps_per_second": 34.469,
"step": 170000
},
{
"epoch": 1.6673349045071828,
"grad_norm": 0.025204768404364586,
"learning_rate": 0.00011472611111111111,
"loss": 0.0051,
"step": 170500
},
{
"epoch": 1.6722244496816905,
"grad_norm": 0.027605898678302765,
"learning_rate": 0.00011278166666666666,
"loss": 0.0049,
"step": 171000
},
{
"epoch": 1.6722244496816905,
"eval_accuracy": 0.9994744285714285,
"eval_loss": 0.003567066974937916,
"eval_runtime": 53.8985,
"eval_samples_per_second": 556.602,
"eval_steps_per_second": 34.788,
"step": 171000
},
{
"epoch": 1.6771139948561986,
"grad_norm": 0.038017790764570236,
"learning_rate": 0.00011083722222222223,
"loss": 0.005,
"step": 171500
},
{
"epoch": 1.6820035400307063,
"grad_norm": 0.048752035945653915,
"learning_rate": 0.00010889277777777778,
"loss": 0.005,
"step": 172000
},
{
"epoch": 1.6820035400307063,
"eval_accuracy": 0.9994751428571429,
"eval_loss": 0.003484962275251746,
"eval_runtime": 54.461,
"eval_samples_per_second": 550.853,
"eval_steps_per_second": 34.428,
"step": 172000
},
{
"epoch": 1.6868930852052142,
"grad_norm": 0.08453824371099472,
"learning_rate": 0.00010694833333333333,
"loss": 0.005,
"step": 172500
},
{
"epoch": 1.691782630379722,
"grad_norm": 0.01620589755475521,
"learning_rate": 0.00010500388888888888,
"loss": 0.005,
"step": 173000
},
{
"epoch": 1.691782630379722,
"eval_accuracy": 0.9994759047619047,
"eval_loss": 0.003478883532807231,
"eval_runtime": 54.0084,
"eval_samples_per_second": 555.469,
"eval_steps_per_second": 34.717,
"step": 173000
},
{
"epoch": 1.69667217555423,
"grad_norm": 0.024735888466238976,
"learning_rate": 0.00010305944444444445,
"loss": 0.005,
"step": 173500
},
{
"epoch": 1.701561720728738,
"grad_norm": 0.020829100161790848,
"learning_rate": 0.000101115,
"loss": 0.005,
"step": 174000
},
{
"epoch": 1.701561720728738,
"eval_accuracy": 0.9994835714285715,
"eval_loss": 0.003460401203483343,
"eval_runtime": 53.8886,
"eval_samples_per_second": 556.704,
"eval_steps_per_second": 34.794,
"step": 174000
},
{
"epoch": 1.7064512659032456,
"grad_norm": 0.02870938368141651,
"learning_rate": 9.917055555555555e-05,
"loss": 0.0049,
"step": 174500
},
{
"epoch": 1.7113408110777537,
"grad_norm": 0.03082539327442646,
"learning_rate": 9.72261111111111e-05,
"loss": 0.0049,
"step": 175000
},
{
"epoch": 1.7113408110777537,
"eval_accuracy": 0.9994848095238095,
"eval_loss": 0.0034341050777584314,
"eval_runtime": 53.6965,
"eval_samples_per_second": 558.695,
"eval_steps_per_second": 34.918,
"step": 175000
},
{
"epoch": 1.7162303562522614,
"grad_norm": 0.04300360381603241,
"learning_rate": 9.528166666666667e-05,
"loss": 0.0047,
"step": 175500
},
{
"epoch": 1.7211199014267693,
"grad_norm": 0.010836569592356682,
"learning_rate": 9.333722222222222e-05,
"loss": 0.0049,
"step": 176000
},
{
"epoch": 1.7211199014267693,
"eval_accuracy": 0.999487,
"eval_loss": 0.0034149654675275087,
"eval_runtime": 54.2439,
"eval_samples_per_second": 553.058,
"eval_steps_per_second": 34.566,
"step": 176000
},
{
"epoch": 1.7260094466012772,
"grad_norm": 0.012880703434348106,
"learning_rate": 9.139277777777777e-05,
"loss": 0.0049,
"step": 176500
},
{
"epoch": 1.7308989917757849,
"grad_norm": 0.029965711757540703,
"learning_rate": 8.944833333333334e-05,
"loss": 0.0049,
"step": 177000
},
{
"epoch": 1.7308989917757849,
"eval_accuracy": 0.9994862857142857,
"eval_loss": 0.0034615020267665386,
"eval_runtime": 53.1772,
"eval_samples_per_second": 564.151,
"eval_steps_per_second": 35.259,
"step": 177000
},
{
"epoch": 1.735788536950293,
"grad_norm": 0.014986414462327957,
"learning_rate": 8.750388888888889e-05,
"loss": 0.0048,
"step": 177500
},
{
"epoch": 1.7406780821248007,
"grad_norm": 0.02675153873860836,
"learning_rate": 8.555944444444445e-05,
"loss": 0.0049,
"step": 178000
},
{
"epoch": 1.7406780821248007,
"eval_accuracy": 0.9994909047619047,
"eval_loss": 0.003412367310374975,
"eval_runtime": 54.1342,
"eval_samples_per_second": 554.178,
"eval_steps_per_second": 34.636,
"step": 178000
},
{
"epoch": 1.7455676272993086,
"grad_norm": 0.031100204214453697,
"learning_rate": 8.3615e-05,
"loss": 0.0051,
"step": 178500
},
{
"epoch": 1.7504571724738165,
"grad_norm": 0.04925690218806267,
"learning_rate": 8.167055555555555e-05,
"loss": 0.005,
"step": 179000
},
{
"epoch": 1.7504571724738165,
"eval_accuracy": 0.9994981428571429,
"eval_loss": 0.003331870539113879,
"eval_runtime": 53.5829,
"eval_samples_per_second": 559.881,
"eval_steps_per_second": 34.993,
"step": 179000
},
{
"epoch": 1.7553467176483244,
"grad_norm": 0.029799846932291985,
"learning_rate": 7.972611111111112e-05,
"loss": 0.0048,
"step": 179500
},
{
"epoch": 1.7602362628228323,
"grad_norm": 0.012169072404503822,
"learning_rate": 7.778166666666666e-05,
"loss": 0.005,
"step": 180000
},
{
"epoch": 1.7602362628228323,
"eval_accuracy": 0.9994982380952381,
"eval_loss": 0.003362874034792185,
"eval_runtime": 53.5982,
"eval_samples_per_second": 559.72,
"eval_steps_per_second": 34.983,
"step": 180000
},
{
"epoch": 1.76512580799734,
"grad_norm": 0.016585633158683777,
"learning_rate": 7.583722222222222e-05,
"loss": 0.0045,
"step": 180500
},
{
"epoch": 1.770015353171848,
"grad_norm": 0.025369074195623398,
"learning_rate": 7.389277777777777e-05,
"loss": 0.0047,
"step": 181000
},
{
"epoch": 1.770015353171848,
"eval_accuracy": 0.9995001904761904,
"eval_loss": 0.0033530080690979958,
"eval_runtime": 53.4193,
"eval_samples_per_second": 561.595,
"eval_steps_per_second": 35.1,
"step": 181000
},
{
"epoch": 1.7749048983463558,
"grad_norm": 0.04421771690249443,
"learning_rate": 7.194833333333333e-05,
"loss": 0.0046,
"step": 181500
},
{
"epoch": 1.7797944435208637,
"grad_norm": 0.05346609279513359,
"learning_rate": 7.000388888888889e-05,
"loss": 0.0048,
"step": 182000
},
{
"epoch": 1.7797944435208637,
"eval_accuracy": 0.9995021428571429,
"eval_loss": 0.0033386677969247103,
"eval_runtime": 53.8476,
"eval_samples_per_second": 557.128,
"eval_steps_per_second": 34.82,
"step": 182000
},
{
"epoch": 1.7846839886953716,
"grad_norm": 0.019687172025442123,
"learning_rate": 6.805944444444444e-05,
"loss": 0.0048,
"step": 182500
},
{
"epoch": 1.7895735338698793,
"grad_norm": 0.026194104924798012,
"learning_rate": 6.6115e-05,
"loss": 0.0048,
"step": 183000
},
{
"epoch": 1.7895735338698793,
"eval_accuracy": 0.9995039523809524,
"eval_loss": 0.003320470917969942,
"eval_runtime": 54.9547,
"eval_samples_per_second": 545.904,
"eval_steps_per_second": 34.119,
"step": 183000
},
{
"epoch": 1.7944630790443874,
"grad_norm": 0.039239440113306046,
"learning_rate": 6.417055555555556e-05,
"loss": 0.0046,
"step": 183500
},
{
"epoch": 1.799352624218895,
"grad_norm": 0.007467139046639204,
"learning_rate": 6.222611111111111e-05,
"loss": 0.0045,
"step": 184000
},
{
"epoch": 1.799352624218895,
"eval_accuracy": 0.9995039047619048,
"eval_loss": 0.003313555382192135,
"eval_runtime": 54.3263,
"eval_samples_per_second": 552.218,
"eval_steps_per_second": 34.514,
"step": 184000
},
{
"epoch": 1.804242169393403,
"grad_norm": 0.015036596916615963,
"learning_rate": 6.028166666666666e-05,
"loss": 0.0047,
"step": 184500
},
{
"epoch": 1.8091317145679109,
"grad_norm": 0.03583378717303276,
"learning_rate": 5.8337222222222226e-05,
"loss": 0.0045,
"step": 185000
},
{
"epoch": 1.8091317145679109,
"eval_accuracy": 0.9995053333333334,
"eval_loss": 0.003319466719403863,
"eval_runtime": 53.7318,
"eval_samples_per_second": 558.329,
"eval_steps_per_second": 34.896,
"step": 185000
},
{
"epoch": 1.8140212597424188,
"grad_norm": 0.025585120543837547,
"learning_rate": 5.6392777777777775e-05,
"loss": 0.0046,
"step": 185500
},
{
"epoch": 1.8189108049169267,
"grad_norm": 0.05633428320288658,
"learning_rate": 5.444833333333333e-05,
"loss": 0.0049,
"step": 186000
},
{
"epoch": 1.8189108049169267,
"eval_accuracy": 0.9995094285714285,
"eval_loss": 0.0032863873057067394,
"eval_runtime": 54.2808,
"eval_samples_per_second": 552.682,
"eval_steps_per_second": 34.543,
"step": 186000
},
{
"epoch": 1.8238003500914344,
"grad_norm": 0.08839651942253113,
"learning_rate": 5.2503888888888895e-05,
"loss": 0.0046,
"step": 186500
},
{
"epoch": 1.8286898952659425,
"grad_norm": 0.02346086874604225,
"learning_rate": 5.0559444444444445e-05,
"loss": 0.0046,
"step": 187000
},
{
"epoch": 1.8286898952659425,
"eval_accuracy": 0.999513,
"eval_loss": 0.0032574611250311136,
"eval_runtime": 53.1956,
"eval_samples_per_second": 563.957,
"eval_steps_per_second": 35.247,
"step": 187000
},
{
"epoch": 1.8335794404404502,
"grad_norm": 0.04460394009947777,
"learning_rate": 4.8615e-05,
"loss": 0.0048,
"step": 187500
},
{
"epoch": 1.838468985614958,
"grad_norm": 0.039988644421100616,
"learning_rate": 4.667055555555555e-05,
"loss": 0.0045,
"step": 188000
},
{
"epoch": 1.838468985614958,
"eval_accuracy": 0.999518380952381,
"eval_loss": 0.0032375219743698835,
"eval_runtime": 54.4684,
"eval_samples_per_second": 550.778,
"eval_steps_per_second": 34.424,
"step": 188000
},
{
"epoch": 1.843358530789466,
"grad_norm": 0.026043614372611046,
"learning_rate": 4.4726111111111114e-05,
"loss": 0.0045,
"step": 188500
},
{
"epoch": 1.8482480759639737,
"grad_norm": 0.03250015527009964,
"learning_rate": 4.2781666666666664e-05,
"loss": 0.0046,
"step": 189000
},
{
"epoch": 1.8482480759639737,
"eval_accuracy": 0.999518,
"eval_loss": 0.003229686524719,
"eval_runtime": 53.1577,
"eval_samples_per_second": 564.358,
"eval_steps_per_second": 35.272,
"step": 189000
},
{
"epoch": 1.8531376211384818,
"grad_norm": 0.041333604604005814,
"learning_rate": 4.083722222222222e-05,
"loss": 0.0045,
"step": 189500
},
{
"epoch": 1.8580271663129895,
"grad_norm": 0.030839432030916214,
"learning_rate": 3.889277777777778e-05,
"loss": 0.0044,
"step": 190000
},
{
"epoch": 1.8580271663129895,
"eval_accuracy": 0.9995217142857142,
"eval_loss": 0.0032240275759249926,
"eval_runtime": 53.3872,
"eval_samples_per_second": 561.933,
"eval_steps_per_second": 35.121,
"step": 190000
},
{
"epoch": 1.8629167114874974,
"grad_norm": 0.0212627574801445,
"learning_rate": 3.694833333333333e-05,
"loss": 0.0044,
"step": 190500
},
{
"epoch": 1.8678062566620053,
"grad_norm": 0.04159221053123474,
"learning_rate": 3.500388888888889e-05,
"loss": 0.0046,
"step": 191000
},
{
"epoch": 1.8678062566620053,
"eval_accuracy": 0.9995220476190476,
"eval_loss": 0.0032232191879302263,
"eval_runtime": 53.7977,
"eval_samples_per_second": 557.645,
"eval_steps_per_second": 34.853,
"step": 191000
},
{
"epoch": 1.8726958018365132,
"grad_norm": 0.02389533445239067,
"learning_rate": 3.3059444444444446e-05,
"loss": 0.0045,
"step": 191500
},
{
"epoch": 1.877585347011021,
"grad_norm": 0.02341424487531185,
"learning_rate": 3.1115e-05,
"loss": 0.0045,
"step": 192000
},
{
"epoch": 1.877585347011021,
"eval_accuracy": 0.9995232857142857,
"eval_loss": 0.0032045834232121706,
"eval_runtime": 54.1632,
"eval_samples_per_second": 553.881,
"eval_steps_per_second": 34.618,
"step": 192000
},
{
"epoch": 1.8824748921855288,
"grad_norm": 0.03770390897989273,
"learning_rate": 2.9170555555555556e-05,
"loss": 0.0046,
"step": 192500
},
{
"epoch": 1.887364437360037,
"grad_norm": 0.024086985737085342,
"learning_rate": 2.7226111111111112e-05,
"loss": 0.0044,
"step": 193000
},
{
"epoch": 1.887364437360037,
"eval_accuracy": 0.9995234285714286,
"eval_loss": 0.003206311957910657,
"eval_runtime": 53.6844,
"eval_samples_per_second": 558.822,
"eval_steps_per_second": 34.926,
"step": 193000
},
{
"epoch": 1.8922539825345446,
"grad_norm": 0.02860225737094879,
"learning_rate": 2.5281666666666665e-05,
"loss": 0.0043,
"step": 193500
},
{
"epoch": 1.8971435277090525,
"grad_norm": 0.034325193613767624,
"learning_rate": 2.3337222222222222e-05,
"loss": 0.0044,
"step": 194000
},
{
"epoch": 1.8971435277090525,
"eval_accuracy": 0.9995254285714286,
"eval_loss": 0.0031913991551846266,
"eval_runtime": 54.2224,
"eval_samples_per_second": 553.276,
"eval_steps_per_second": 34.58,
"step": 194000
},
{
"epoch": 1.9020330728835604,
"grad_norm": 0.03300917148590088,
"learning_rate": 2.139277777777778e-05,
"loss": 0.0045,
"step": 194500
},
{
"epoch": 1.9069226180580683,
"grad_norm": 0.037190355360507965,
"learning_rate": 1.9448333333333335e-05,
"loss": 0.0043,
"step": 195000
},
{
"epoch": 1.9069226180580683,
"eval_accuracy": 0.9995253333333334,
"eval_loss": 0.0031883243937045336,
"eval_runtime": 54.2098,
"eval_samples_per_second": 553.405,
"eval_steps_per_second": 34.588,
"step": 195000
},
{
"epoch": 1.9118121632325762,
"grad_norm": 0.1029098629951477,
"learning_rate": 1.7503888888888888e-05,
"loss": 0.0045,
"step": 195500
},
{
"epoch": 1.9167017084070839,
"grad_norm": 0.027764180675148964,
"learning_rate": 1.5559444444444444e-05,
"loss": 0.0043,
"step": 196000
},
{
"epoch": 1.9167017084070839,
"eval_accuracy": 0.9995274761904762,
"eval_loss": 0.0031636343337595463,
"eval_runtime": 54.5719,
"eval_samples_per_second": 549.733,
"eval_steps_per_second": 34.358,
"step": 196000
},
{
"epoch": 1.921591253581592,
"grad_norm": 0.031358424574136734,
"learning_rate": 1.3615e-05,
"loss": 0.0043,
"step": 196500
},
{
"epoch": 1.9264807987560997,
"grad_norm": 0.035557158291339874,
"learning_rate": 1.1670555555555556e-05,
"loss": 0.0046,
"step": 197000
},
{
"epoch": 1.9264807987560997,
"eval_accuracy": 0.999529761904762,
"eval_loss": 0.0031551867723464966,
"eval_runtime": 56.5808,
"eval_samples_per_second": 530.215,
"eval_steps_per_second": 33.138,
"step": 197000
},
{
"epoch": 1.9313703439306076,
"grad_norm": 0.034682463854551315,
"learning_rate": 9.72611111111111e-06,
"loss": 0.0045,
"step": 197500
},
{
"epoch": 1.9362598891051155,
"grad_norm": 0.023823970928788185,
"learning_rate": 7.781666666666667e-06,
"loss": 0.0043,
"step": 198000
},
{
"epoch": 1.9362598891051155,
"eval_accuracy": 0.9995307142857143,
"eval_loss": 0.0031563735101372004,
"eval_runtime": 56.8893,
"eval_samples_per_second": 527.34,
"eval_steps_per_second": 32.959,
"step": 198000
},
{
"epoch": 1.9411494342796232,
"grad_norm": 0.020929349586367607,
"learning_rate": 5.837222222222222e-06,
"loss": 0.0044,
"step": 198500
},
{
"epoch": 1.9460389794541313,
"grad_norm": 0.025913028046488762,
"learning_rate": 3.892777777777778e-06,
"loss": 0.0044,
"step": 199000
},
{
"epoch": 1.9460389794541313,
"eval_accuracy": 0.9995307619047619,
"eval_loss": 0.003153804922476411,
"eval_runtime": 54.8658,
"eval_samples_per_second": 546.788,
"eval_steps_per_second": 34.174,
"step": 199000
},
{
"epoch": 1.950928524628639,
"grad_norm": 0.02582838013768196,
"learning_rate": 1.9483333333333335e-06,
"loss": 0.0044,
"step": 199500
},
{
"epoch": 1.9558180698031469,
"grad_norm": 0.016758419573307037,
"learning_rate": 3.888888888888889e-09,
"loss": 0.0042,
"step": 200000
},
{
"epoch": 1.9558180698031469,
"eval_accuracy": 0.9995309047619048,
"eval_loss": 0.003156075021252036,
"eval_runtime": 55.5903,
"eval_samples_per_second": 539.663,
"eval_steps_per_second": 33.729,
"step": 200000
}
],
"logging_steps": 500,
"max_steps": 200000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 7,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.480299103223808e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}