{ "best_global_step": 199000, "best_metric": 0.003153804922476411, "best_model_checkpoint": "./models/t5-small-separated-augmented-200k\\checkpoint-199000", "epoch": 1.9558180698031469, "eval_steps": 1000, "global_step": 200000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004889545174507868, "grad_norm": 0.5060574412345886, "learning_rate": 1.7465e-05, "loss": 6.6771, "step": 500 }, { "epoch": 0.009779090349015735, "grad_norm": 0.35827475786209106, "learning_rate": 3.4965e-05, "loss": 0.1893, "step": 1000 }, { "epoch": 0.009779090349015735, "eval_accuracy": 0.9878454285714285, "eval_loss": 0.09678807854652405, "eval_runtime": 54.2373, "eval_samples_per_second": 553.125, "eval_steps_per_second": 34.57, "step": 1000 }, { "epoch": 0.014668635523523602, "grad_norm": 0.34889769554138184, "learning_rate": 5.2465e-05, "loss": 0.1447, "step": 1500 }, { "epoch": 0.01955818069803147, "grad_norm": 0.23819516599178314, "learning_rate": 6.9965e-05, "loss": 0.1206, "step": 2000 }, { "epoch": 0.01955818069803147, "eval_accuracy": 0.9902656666666667, "eval_loss": 0.073659747838974, "eval_runtime": 53.4879, "eval_samples_per_second": 560.875, "eval_steps_per_second": 35.055, "step": 2000 }, { "epoch": 0.024447725872539336, "grad_norm": 0.22346411645412445, "learning_rate": 8.7465e-05, "loss": 0.1088, "step": 2500 }, { "epoch": 0.029337271047047205, "grad_norm": 0.1609542965888977, "learning_rate": 0.000104965, "loss": 0.1005, "step": 3000 }, { "epoch": 0.029337271047047205, "eval_accuracy": 0.9918363809523809, "eval_loss": 0.05878164619207382, "eval_runtime": 55.0708, "eval_samples_per_second": 544.753, "eval_steps_per_second": 34.047, "step": 3000 }, { "epoch": 0.03422681622155507, "grad_norm": 0.16877996921539307, "learning_rate": 0.000122465, "loss": 0.0921, "step": 3500 }, { "epoch": 0.03911636139606294, "grad_norm": 0.19500480592250824, "learning_rate": 0.00013996499999999998, "loss": 0.0857, "step": 4000 }, { "epoch": 0.03911636139606294, "eval_accuracy": 0.992877380952381, "eval_loss": 0.05044129863381386, "eval_runtime": 52.8632, "eval_samples_per_second": 567.503, "eval_steps_per_second": 35.469, "step": 4000 }, { "epoch": 0.0440059065705708, "grad_norm": 0.2271735668182373, "learning_rate": 0.000157465, "loss": 0.0785, "step": 4500 }, { "epoch": 0.04889545174507867, "grad_norm": 0.15599773824214935, "learning_rate": 0.000174965, "loss": 0.0743, "step": 5000 }, { "epoch": 0.04889545174507867, "eval_accuracy": 0.993641380952381, "eval_loss": 0.04416767507791519, "eval_runtime": 53.9114, "eval_samples_per_second": 556.469, "eval_steps_per_second": 34.779, "step": 5000 }, { "epoch": 0.05378499691958654, "grad_norm": 0.13649936020374298, "learning_rate": 0.000192465, "loss": 0.0696, "step": 5500 }, { "epoch": 0.05867454209409441, "grad_norm": 0.17215129733085632, "learning_rate": 0.000209965, "loss": 0.0669, "step": 6000 }, { "epoch": 0.05867454209409441, "eval_accuracy": 0.994305380952381, "eval_loss": 0.03922554850578308, "eval_runtime": 53.2344, "eval_samples_per_second": 563.546, "eval_steps_per_second": 35.222, "step": 6000 }, { "epoch": 0.06356408726860227, "grad_norm": 0.21248804032802582, "learning_rate": 0.00022746500000000002, "loss": 0.0636, "step": 6500 }, { "epoch": 0.06845363244311015, "grad_norm": 0.2209671139717102, "learning_rate": 0.000244965, "loss": 0.062, "step": 7000 }, { "epoch": 0.06845363244311015, "eval_accuracy": 0.9948234285714286, "eval_loss": 0.035148605704307556, "eval_runtime": 53.6983, "eval_samples_per_second": 558.677, "eval_steps_per_second": 34.917, "step": 7000 }, { "epoch": 0.07334317761761801, "grad_norm": 0.16804896295070648, "learning_rate": 0.000262465, "loss": 0.0584, "step": 7500 }, { "epoch": 0.07823272279212588, "grad_norm": 0.13331238925457, "learning_rate": 0.000279965, "loss": 0.0576, "step": 8000 }, { "epoch": 0.07823272279212588, "eval_accuracy": 0.995110619047619, "eval_loss": 0.03278239071369171, "eval_runtime": 52.8459, "eval_samples_per_second": 567.688, "eval_steps_per_second": 35.481, "step": 8000 }, { "epoch": 0.08312226796663374, "grad_norm": 0.15275965631008148, "learning_rate": 0.000297465, "loss": 0.0545, "step": 8500 }, { "epoch": 0.0880118131411416, "grad_norm": 0.14770014584064484, "learning_rate": 0.000314965, "loss": 0.0509, "step": 9000 }, { "epoch": 0.0880118131411416, "eval_accuracy": 0.9955557619047619, "eval_loss": 0.029773302376270294, "eval_runtime": 53.681, "eval_samples_per_second": 558.857, "eval_steps_per_second": 34.929, "step": 9000 }, { "epoch": 0.09290135831564948, "grad_norm": 0.13802163302898407, "learning_rate": 0.000332465, "loss": 0.0503, "step": 9500 }, { "epoch": 0.09779090349015734, "grad_norm": 0.16345028579235077, "learning_rate": 0.000349965, "loss": 0.0492, "step": 10000 }, { "epoch": 0.09779090349015734, "eval_accuracy": 0.9958419047619048, "eval_loss": 0.027848461642861366, "eval_runtime": 53.5151, "eval_samples_per_second": 560.589, "eval_steps_per_second": 35.037, "step": 10000 }, { "epoch": 0.10268044866466522, "grad_norm": 0.09112809598445892, "learning_rate": 0.00036746500000000003, "loss": 0.0475, "step": 10500 }, { "epoch": 0.10756999383917308, "grad_norm": 0.20798154175281525, "learning_rate": 0.000384965, "loss": 0.046, "step": 11000 }, { "epoch": 0.10756999383917308, "eval_accuracy": 0.99604, "eval_loss": 0.026027251034975052, "eval_runtime": 53.2138, "eval_samples_per_second": 563.764, "eval_steps_per_second": 35.235, "step": 11000 }, { "epoch": 0.11245953901368094, "grad_norm": 0.19015829265117645, "learning_rate": 0.00040246499999999996, "loss": 0.0432, "step": 11500 }, { "epoch": 0.11734908418818882, "grad_norm": 0.12272685021162033, "learning_rate": 0.000419965, "loss": 0.0434, "step": 12000 }, { "epoch": 0.11734908418818882, "eval_accuracy": 0.9963316666666666, "eval_loss": 0.024126138538122177, "eval_runtime": 53.0184, "eval_samples_per_second": 565.841, "eval_steps_per_second": 35.365, "step": 12000 }, { "epoch": 0.12223862936269668, "grad_norm": 0.10090415924787521, "learning_rate": 0.000437465, "loss": 0.0425, "step": 12500 }, { "epoch": 0.12712817453720454, "grad_norm": 0.113510861992836, "learning_rate": 0.000454965, "loss": 0.0412, "step": 13000 }, { "epoch": 0.12712817453720454, "eval_accuracy": 0.9965295714285715, "eval_loss": 0.022982601076364517, "eval_runtime": 54.4384, "eval_samples_per_second": 551.082, "eval_steps_per_second": 34.443, "step": 13000 }, { "epoch": 0.13201771971171242, "grad_norm": 0.09937796741724014, "learning_rate": 0.00047246500000000004, "loss": 0.04, "step": 13500 }, { "epoch": 0.1369072648862203, "grad_norm": 0.11914831399917603, "learning_rate": 0.000489965, "loss": 0.0389, "step": 14000 }, { "epoch": 0.1369072648862203, "eval_accuracy": 0.996802, "eval_loss": 0.02113029547035694, "eval_runtime": 53.2003, "eval_samples_per_second": 563.907, "eval_steps_per_second": 35.244, "step": 14000 }, { "epoch": 0.14179681006072814, "grad_norm": 0.17324307560920715, "learning_rate": 0.000507465, "loss": 0.0384, "step": 14500 }, { "epoch": 0.14668635523523602, "grad_norm": 0.12623025476932526, "learning_rate": 0.000524965, "loss": 0.0364, "step": 15000 }, { "epoch": 0.14668635523523602, "eval_accuracy": 0.9968850476190476, "eval_loss": 0.020160900428891182, "eval_runtime": 53.6937, "eval_samples_per_second": 558.725, "eval_steps_per_second": 34.92, "step": 15000 }, { "epoch": 0.1515759004097439, "grad_norm": 0.1337081342935562, "learning_rate": 0.000542465, "loss": 0.0367, "step": 15500 }, { "epoch": 0.15646544558425177, "grad_norm": 0.16239804029464722, "learning_rate": 0.000559965, "loss": 0.0357, "step": 16000 }, { "epoch": 0.15646544558425177, "eval_accuracy": 0.9969695714285715, "eval_loss": 0.020250126719474792, "eval_runtime": 54.2376, "eval_samples_per_second": 553.122, "eval_steps_per_second": 34.57, "step": 16000 }, { "epoch": 0.16135499075875961, "grad_norm": 0.09299212694168091, "learning_rate": 0.000577465, "loss": 0.0356, "step": 16500 }, { "epoch": 0.1662445359332675, "grad_norm": 0.12462040781974792, "learning_rate": 0.000594965, "loss": 0.0343, "step": 17000 }, { "epoch": 0.1662445359332675, "eval_accuracy": 0.9971193333333334, "eval_loss": 0.01877717673778534, "eval_runtime": 54.3351, "eval_samples_per_second": 552.13, "eval_steps_per_second": 34.508, "step": 17000 }, { "epoch": 0.17113408110777537, "grad_norm": 0.08858466893434525, "learning_rate": 0.000612465, "loss": 0.0337, "step": 17500 }, { "epoch": 0.1760236262822832, "grad_norm": 0.14879809319972992, "learning_rate": 0.000629965, "loss": 0.0335, "step": 18000 }, { "epoch": 0.1760236262822832, "eval_accuracy": 0.9971792380952381, "eval_loss": 0.018667874857783318, "eval_runtime": 54.7854, "eval_samples_per_second": 547.591, "eval_steps_per_second": 34.224, "step": 18000 }, { "epoch": 0.1809131714567911, "grad_norm": 0.10354409366846085, "learning_rate": 0.0006474650000000001, "loss": 0.032, "step": 18500 }, { "epoch": 0.18580271663129896, "grad_norm": 0.1182965636253357, "learning_rate": 0.000664965, "loss": 0.0318, "step": 19000 }, { "epoch": 0.18580271663129896, "eval_accuracy": 0.9973930952380953, "eval_loss": 0.017232514917850494, "eval_runtime": 53.7973, "eval_samples_per_second": 557.649, "eval_steps_per_second": 34.853, "step": 19000 }, { "epoch": 0.1906922618058068, "grad_norm": 0.05959112569689751, "learning_rate": 0.0006824649999999999, "loss": 0.0318, "step": 19500 }, { "epoch": 0.1955818069803147, "grad_norm": 0.1270582675933838, "learning_rate": 0.000699965, "loss": 0.0307, "step": 20000 }, { "epoch": 0.1955818069803147, "eval_accuracy": 0.9973767619047619, "eval_loss": 0.01737845316529274, "eval_runtime": 53.2789, "eval_samples_per_second": 563.075, "eval_steps_per_second": 35.192, "step": 20000 }, { "epoch": 0.20047135215482256, "grad_norm": 0.08427739888429642, "learning_rate": 0.0006980594444444445, "loss": 0.0298, "step": 20500 }, { "epoch": 0.20536089732933044, "grad_norm": 0.07171203941106796, "learning_rate": 0.000696115, "loss": 0.0293, "step": 21000 }, { "epoch": 0.20536089732933044, "eval_accuracy": 0.9975350952380952, "eval_loss": 0.016114523634314537, "eval_runtime": 53.7368, "eval_samples_per_second": 558.277, "eval_steps_per_second": 34.892, "step": 21000 }, { "epoch": 0.2102504425038383, "grad_norm": 0.07719539105892181, "learning_rate": 0.0006941705555555555, "loss": 0.0291, "step": 21500 }, { "epoch": 0.21513998767834616, "grad_norm": 0.08832105249166489, "learning_rate": 0.0006922261111111111, "loss": 0.0286, "step": 22000 }, { "epoch": 0.21513998767834616, "eval_accuracy": 0.9976384285714286, "eval_loss": 0.015542366541922092, "eval_runtime": 53.6783, "eval_samples_per_second": 558.885, "eval_steps_per_second": 34.93, "step": 22000 }, { "epoch": 0.22002953285285404, "grad_norm": 0.1472863107919693, "learning_rate": 0.0006902816666666667, "loss": 0.0277, "step": 22500 }, { "epoch": 0.22491907802736189, "grad_norm": 0.09753895550966263, "learning_rate": 0.0006883372222222222, "loss": 0.0268, "step": 23000 }, { "epoch": 0.22491907802736189, "eval_accuracy": 0.9977074761904762, "eval_loss": 0.015192433260381222, "eval_runtime": 53.436, "eval_samples_per_second": 561.419, "eval_steps_per_second": 35.089, "step": 23000 }, { "epoch": 0.22980862320186976, "grad_norm": 0.12348861992359161, "learning_rate": 0.0006863927777777778, "loss": 0.026, "step": 23500 }, { "epoch": 0.23469816837637764, "grad_norm": 0.1123756393790245, "learning_rate": 0.0006844483333333333, "loss": 0.0257, "step": 24000 }, { "epoch": 0.23469816837637764, "eval_accuracy": 0.997726761904762, "eval_loss": 0.014932113699615002, "eval_runtime": 53.1941, "eval_samples_per_second": 563.972, "eval_steps_per_second": 35.248, "step": 24000 }, { "epoch": 0.23958771355088548, "grad_norm": 0.07256095856428146, "learning_rate": 0.0006825038888888889, "loss": 0.0256, "step": 24500 }, { "epoch": 0.24447725872539336, "grad_norm": 0.05496814846992493, "learning_rate": 0.0006805594444444444, "loss": 0.0251, "step": 25000 }, { "epoch": 0.24447725872539336, "eval_accuracy": 0.9978721904761905, "eval_loss": 0.01384472381323576, "eval_runtime": 54.0604, "eval_samples_per_second": 554.935, "eval_steps_per_second": 34.683, "step": 25000 }, { "epoch": 0.24936680389990123, "grad_norm": 0.09915214031934738, "learning_rate": 0.000678615, "loss": 0.0251, "step": 25500 }, { "epoch": 0.2542563490744091, "grad_norm": 0.14060749113559723, "learning_rate": 0.0006766705555555555, "loss": 0.0244, "step": 26000 }, { "epoch": 0.2542563490744091, "eval_accuracy": 0.9979192857142857, "eval_loss": 0.01368007156997919, "eval_runtime": 52.8524, "eval_samples_per_second": 567.618, "eval_steps_per_second": 35.476, "step": 26000 }, { "epoch": 0.259145894248917, "grad_norm": 0.09252548217773438, "learning_rate": 0.0006747261111111111, "loss": 0.024, "step": 26500 }, { "epoch": 0.26403543942342483, "grad_norm": 0.11915791034698486, "learning_rate": 0.0006727816666666666, "loss": 0.0232, "step": 27000 }, { "epoch": 0.26403543942342483, "eval_accuracy": 0.9980117142857143, "eval_loss": 0.012998638674616814, "eval_runtime": 54.0246, "eval_samples_per_second": 555.303, "eval_steps_per_second": 34.706, "step": 27000 }, { "epoch": 0.2689249845979327, "grad_norm": 0.10810112953186035, "learning_rate": 0.0006708372222222222, "loss": 0.0233, "step": 27500 }, { "epoch": 0.2738145297724406, "grad_norm": 0.07593973726034164, "learning_rate": 0.0006688927777777778, "loss": 0.0227, "step": 28000 }, { "epoch": 0.2738145297724406, "eval_accuracy": 0.9980548095238095, "eval_loss": 0.012805027887225151, "eval_runtime": 53.176, "eval_samples_per_second": 564.164, "eval_steps_per_second": 35.26, "step": 28000 }, { "epoch": 0.27870407494694843, "grad_norm": 0.06336738914251328, "learning_rate": 0.0006669483333333333, "loss": 0.0229, "step": 28500 }, { "epoch": 0.2835936201214563, "grad_norm": 0.12944093346595764, "learning_rate": 0.0006650038888888889, "loss": 0.0221, "step": 29000 }, { "epoch": 0.2835936201214563, "eval_accuracy": 0.9980741428571429, "eval_loss": 0.012613357976078987, "eval_runtime": 53.5915, "eval_samples_per_second": 559.79, "eval_steps_per_second": 34.987, "step": 29000 }, { "epoch": 0.2884831652959642, "grad_norm": 0.09919234365224838, "learning_rate": 0.0006630594444444445, "loss": 0.0213, "step": 29500 }, { "epoch": 0.29337271047047203, "grad_norm": 0.08204931020736694, "learning_rate": 0.000661115, "loss": 0.0219, "step": 30000 }, { "epoch": 0.29337271047047203, "eval_accuracy": 0.998159, "eval_loss": 0.011940201744437218, "eval_runtime": 53.1317, "eval_samples_per_second": 564.635, "eval_steps_per_second": 35.29, "step": 30000 }, { "epoch": 0.2982622556449799, "grad_norm": 0.11553770303726196, "learning_rate": 0.0006591705555555556, "loss": 0.0208, "step": 30500 }, { "epoch": 0.3031518008194878, "grad_norm": 0.12381038069725037, "learning_rate": 0.0006572261111111111, "loss": 0.0205, "step": 31000 }, { "epoch": 0.3031518008194878, "eval_accuracy": 0.9982196666666666, "eval_loss": 0.011603106744587421, "eval_runtime": 53.375, "eval_samples_per_second": 562.061, "eval_steps_per_second": 35.129, "step": 31000 }, { "epoch": 0.30804134599399563, "grad_norm": 0.06441524624824524, "learning_rate": 0.0006552816666666667, "loss": 0.0204, "step": 31500 }, { "epoch": 0.31293089116850353, "grad_norm": 0.08449769020080566, "learning_rate": 0.0006533372222222222, "loss": 0.0206, "step": 32000 }, { "epoch": 0.31293089116850353, "eval_accuracy": 0.9982467142857143, "eval_loss": 0.011421745643019676, "eval_runtime": 53.2003, "eval_samples_per_second": 563.907, "eval_steps_per_second": 35.244, "step": 32000 }, { "epoch": 0.3178204363430114, "grad_norm": 0.07885874062776566, "learning_rate": 0.0006513927777777777, "loss": 0.02, "step": 32500 }, { "epoch": 0.32270998151751923, "grad_norm": 0.07178321480751038, "learning_rate": 0.0006494483333333333, "loss": 0.0193, "step": 33000 }, { "epoch": 0.32270998151751923, "eval_accuracy": 0.9983428571428571, "eval_loss": 0.011021795682609081, "eval_runtime": 53.8106, "eval_samples_per_second": 557.511, "eval_steps_per_second": 34.844, "step": 33000 }, { "epoch": 0.32759952669202713, "grad_norm": 0.06164510175585747, "learning_rate": 0.0006475038888888888, "loss": 0.0192, "step": 33500 }, { "epoch": 0.332489071866535, "grad_norm": 0.11073775589466095, "learning_rate": 0.0006455594444444444, "loss": 0.0193, "step": 34000 }, { "epoch": 0.332489071866535, "eval_accuracy": 0.9983445238095238, "eval_loss": 0.010947330854833126, "eval_runtime": 53.4068, "eval_samples_per_second": 561.727, "eval_steps_per_second": 35.108, "step": 34000 }, { "epoch": 0.3373786170410428, "grad_norm": 0.1216714084148407, "learning_rate": 0.0006436149999999999, "loss": 0.0191, "step": 34500 }, { "epoch": 0.34226816221555073, "grad_norm": 0.07570644468069077, "learning_rate": 0.0006416705555555556, "loss": 0.0189, "step": 35000 }, { "epoch": 0.34226816221555073, "eval_accuracy": 0.9984051904761905, "eval_loss": 0.01051774900406599, "eval_runtime": 53.8775, "eval_samples_per_second": 556.819, "eval_steps_per_second": 34.801, "step": 35000 }, { "epoch": 0.3471577073900586, "grad_norm": 0.10820703208446503, "learning_rate": 0.0006397261111111112, "loss": 0.0187, "step": 35500 }, { "epoch": 0.3520472525645664, "grad_norm": 0.13289569318294525, "learning_rate": 0.0006377816666666667, "loss": 0.0181, "step": 36000 }, { "epoch": 0.3520472525645664, "eval_accuracy": 0.9984183333333333, "eval_loss": 0.010617985390126705, "eval_runtime": 53.6453, "eval_samples_per_second": 559.229, "eval_steps_per_second": 34.952, "step": 36000 }, { "epoch": 0.35693679773907433, "grad_norm": 0.09950833022594452, "learning_rate": 0.0006358372222222223, "loss": 0.0178, "step": 36500 }, { "epoch": 0.3618263429135822, "grad_norm": 0.12055996805429459, "learning_rate": 0.0006338927777777778, "loss": 0.0174, "step": 37000 }, { "epoch": 0.3618263429135822, "eval_accuracy": 0.9984319047619048, "eval_loss": 0.01043427549302578, "eval_runtime": 53.8445, "eval_samples_per_second": 557.16, "eval_steps_per_second": 34.823, "step": 37000 }, { "epoch": 0.36671588808809, "grad_norm": 0.08831817656755447, "learning_rate": 0.0006319483333333334, "loss": 0.0183, "step": 37500 }, { "epoch": 0.37160543326259793, "grad_norm": 0.09790224581956863, "learning_rate": 0.0006300038888888889, "loss": 0.0171, "step": 38000 }, { "epoch": 0.37160543326259793, "eval_accuracy": 0.9984588571428571, "eval_loss": 0.010330071672797203, "eval_runtime": 53.8512, "eval_samples_per_second": 557.091, "eval_steps_per_second": 34.818, "step": 38000 }, { "epoch": 0.3764949784371058, "grad_norm": 0.05283864215016365, "learning_rate": 0.0006280594444444444, "loss": 0.017, "step": 38500 }, { "epoch": 0.3813845236116136, "grad_norm": 0.12874823808670044, "learning_rate": 0.000626115, "loss": 0.0173, "step": 39000 }, { "epoch": 0.3813845236116136, "eval_accuracy": 0.9984891904761904, "eval_loss": 0.009993654675781727, "eval_runtime": 53.3842, "eval_samples_per_second": 561.964, "eval_steps_per_second": 35.123, "step": 39000 }, { "epoch": 0.3862740687861215, "grad_norm": 0.08774898201227188, "learning_rate": 0.0006241705555555555, "loss": 0.0173, "step": 39500 }, { "epoch": 0.3911636139606294, "grad_norm": 0.092228963971138, "learning_rate": 0.0006222261111111111, "loss": 0.0169, "step": 40000 }, { "epoch": 0.3911636139606294, "eval_accuracy": 0.9984447142857142, "eval_loss": 0.010405597276985645, "eval_runtime": 53.1659, "eval_samples_per_second": 564.272, "eval_steps_per_second": 35.267, "step": 40000 }, { "epoch": 0.3960531591351372, "grad_norm": 0.08975362032651901, "learning_rate": 0.0006202816666666666, "loss": 0.0174, "step": 40500 }, { "epoch": 0.4009427043096451, "grad_norm": 0.09612125158309937, "learning_rate": 0.0006183372222222222, "loss": 0.0168, "step": 41000 }, { "epoch": 0.4009427043096451, "eval_accuracy": 0.9985740952380953, "eval_loss": 0.009390046820044518, "eval_runtime": 53.7483, "eval_samples_per_second": 558.158, "eval_steps_per_second": 34.885, "step": 41000 }, { "epoch": 0.405832249484153, "grad_norm": 0.04056503251194954, "learning_rate": 0.0006163927777777777, "loss": 0.0163, "step": 41500 }, { "epoch": 0.4107217946586609, "grad_norm": 0.11665570735931396, "learning_rate": 0.0006144483333333333, "loss": 0.0165, "step": 42000 }, { "epoch": 0.4107217946586609, "eval_accuracy": 0.998547619047619, "eval_loss": 0.009648078121244907, "eval_runtime": 53.5013, "eval_samples_per_second": 560.734, "eval_steps_per_second": 35.046, "step": 42000 }, { "epoch": 0.4156113398331687, "grad_norm": 0.10102874785661697, "learning_rate": 0.000612503888888889, "loss": 0.0163, "step": 42500 }, { "epoch": 0.4205008850076766, "grad_norm": 0.08108735084533691, "learning_rate": 0.0006105594444444445, "loss": 0.0154, "step": 43000 }, { "epoch": 0.4205008850076766, "eval_accuracy": 0.998580380952381, "eval_loss": 0.009399999864399433, "eval_runtime": 53.6417, "eval_samples_per_second": 559.266, "eval_steps_per_second": 34.954, "step": 43000 }, { "epoch": 0.4253904301821845, "grad_norm": 0.07910118252038956, "learning_rate": 0.000608615, "loss": 0.0158, "step": 43500 }, { "epoch": 0.4302799753566923, "grad_norm": 0.0742466077208519, "learning_rate": 0.0006066705555555556, "loss": 0.0154, "step": 44000 }, { "epoch": 0.4302799753566923, "eval_accuracy": 0.9986305238095238, "eval_loss": 0.009053844027221203, "eval_runtime": 53.2625, "eval_samples_per_second": 563.248, "eval_steps_per_second": 35.203, "step": 44000 }, { "epoch": 0.43516952053120017, "grad_norm": 0.06712730973958969, "learning_rate": 0.0006047261111111111, "loss": 0.0157, "step": 44500 }, { "epoch": 0.4400590657057081, "grad_norm": 0.049518078565597534, "learning_rate": 0.0006027816666666667, "loss": 0.0154, "step": 45000 }, { "epoch": 0.4400590657057081, "eval_accuracy": 0.9986142857142857, "eval_loss": 0.009205291979014874, "eval_runtime": 55.4539, "eval_samples_per_second": 540.99, "eval_steps_per_second": 33.812, "step": 45000 }, { "epoch": 0.4449486108802159, "grad_norm": 0.0538068488240242, "learning_rate": 0.0006008372222222222, "loss": 0.0154, "step": 45500 }, { "epoch": 0.44983815605472377, "grad_norm": 0.08187378942966461, "learning_rate": 0.0005988927777777778, "loss": 0.015, "step": 46000 }, { "epoch": 0.44983815605472377, "eval_accuracy": 0.9986327142857143, "eval_loss": 0.009027380496263504, "eval_runtime": 53.2362, "eval_samples_per_second": 563.526, "eval_steps_per_second": 35.22, "step": 46000 }, { "epoch": 0.4547277012292317, "grad_norm": 0.04306895285844803, "learning_rate": 0.0005969483333333333, "loss": 0.0153, "step": 46500 }, { "epoch": 0.4596172464037395, "grad_norm": 0.053645290434360504, "learning_rate": 0.0005950038888888889, "loss": 0.0146, "step": 47000 }, { "epoch": 0.4596172464037395, "eval_accuracy": 0.998660619047619, "eval_loss": 0.008829508908092976, "eval_runtime": 54.0772, "eval_samples_per_second": 554.763, "eval_steps_per_second": 34.673, "step": 47000 }, { "epoch": 0.46450679157824737, "grad_norm": 0.08367203176021576, "learning_rate": 0.0005930594444444444, "loss": 0.0149, "step": 47500 }, { "epoch": 0.46939633675275527, "grad_norm": 0.06427811086177826, "learning_rate": 0.000591115, "loss": 0.0146, "step": 48000 }, { "epoch": 0.46939633675275527, "eval_accuracy": 0.9986682857142857, "eval_loss": 0.008711729198694229, "eval_runtime": 54.7568, "eval_samples_per_second": 547.877, "eval_steps_per_second": 34.242, "step": 48000 }, { "epoch": 0.4742858819272631, "grad_norm": 0.09367698431015015, "learning_rate": 0.0005891705555555556, "loss": 0.0146, "step": 48500 }, { "epoch": 0.47917542710177097, "grad_norm": 0.023252153769135475, "learning_rate": 0.0005872261111111111, "loss": 0.0143, "step": 49000 }, { "epoch": 0.47917542710177097, "eval_accuracy": 0.9987029047619047, "eval_loss": 0.00848183874040842, "eval_runtime": 53.9633, "eval_samples_per_second": 555.933, "eval_steps_per_second": 34.746, "step": 49000 }, { "epoch": 0.48406497227627887, "grad_norm": 0.038976676762104034, "learning_rate": 0.0005852816666666666, "loss": 0.0142, "step": 49500 }, { "epoch": 0.4889545174507867, "grad_norm": 0.048157546669244766, "learning_rate": 0.0005833372222222221, "loss": 0.0146, "step": 50000 }, { "epoch": 0.4889545174507867, "eval_accuracy": 0.9986898571428572, "eval_loss": 0.008633621968328953, "eval_runtime": 53.3435, "eval_samples_per_second": 562.392, "eval_steps_per_second": 35.15, "step": 50000 }, { "epoch": 0.4938440626252946, "grad_norm": 0.04257979243993759, "learning_rate": 0.0005813927777777777, "loss": 0.0145, "step": 50500 }, { "epoch": 0.49873360779980247, "grad_norm": 0.09921249002218246, "learning_rate": 0.0005794483333333334, "loss": 0.0142, "step": 51000 }, { "epoch": 0.49873360779980247, "eval_accuracy": 0.9987676666666667, "eval_loss": 0.008316335268318653, "eval_runtime": 53.6985, "eval_samples_per_second": 558.675, "eval_steps_per_second": 34.917, "step": 51000 }, { "epoch": 0.5036231529743104, "grad_norm": 0.048569273203611374, "learning_rate": 0.0005775038888888889, "loss": 0.0135, "step": 51500 }, { "epoch": 0.5085126981488182, "grad_norm": 0.06064219772815704, "learning_rate": 0.0005755594444444445, "loss": 0.0139, "step": 52000 }, { "epoch": 0.5085126981488182, "eval_accuracy": 0.9987182380952381, "eval_loss": 0.008500739932060242, "eval_runtime": 53.1478, "eval_samples_per_second": 564.463, "eval_steps_per_second": 35.279, "step": 52000 }, { "epoch": 0.5134022433233261, "grad_norm": 0.043598126620054245, "learning_rate": 0.000573615, "loss": 0.0145, "step": 52500 }, { "epoch": 0.518291788497834, "grad_norm": 0.059862203896045685, "learning_rate": 0.0005716705555555556, "loss": 0.0134, "step": 53000 }, { "epoch": 0.518291788497834, "eval_accuracy": 0.9987784761904762, "eval_loss": 0.008033830672502518, "eval_runtime": 55.7465, "eval_samples_per_second": 538.15, "eval_steps_per_second": 33.634, "step": 53000 }, { "epoch": 0.5231813336723418, "grad_norm": 0.05372610315680504, "learning_rate": 0.0005697261111111111, "loss": 0.0136, "step": 53500 }, { "epoch": 0.5280708788468497, "grad_norm": 0.08553345501422882, "learning_rate": 0.0005677816666666667, "loss": 0.0138, "step": 54000 }, { "epoch": 0.5280708788468497, "eval_accuracy": 0.9988229047619047, "eval_loss": 0.007664266973733902, "eval_runtime": 53.9758, "eval_samples_per_second": 555.805, "eval_steps_per_second": 34.738, "step": 54000 }, { "epoch": 0.5329604240213576, "grad_norm": 0.03992351144552231, "learning_rate": 0.0005658372222222222, "loss": 0.0133, "step": 54500 }, { "epoch": 0.5378499691958654, "grad_norm": 0.051119010895490646, "learning_rate": 0.0005638927777777777, "loss": 0.0135, "step": 55000 }, { "epoch": 0.5378499691958654, "eval_accuracy": 0.9988099523809524, "eval_loss": 0.007848628796637058, "eval_runtime": 54.3513, "eval_samples_per_second": 551.965, "eval_steps_per_second": 34.498, "step": 55000 }, { "epoch": 0.5427395143703733, "grad_norm": 0.08714370429515839, "learning_rate": 0.0005619483333333333, "loss": 0.0128, "step": 55500 }, { "epoch": 0.5476290595448812, "grad_norm": 0.07373756170272827, "learning_rate": 0.0005600038888888888, "loss": 0.013, "step": 56000 }, { "epoch": 0.5476290595448812, "eval_accuracy": 0.9988279047619048, "eval_loss": 0.007725988980382681, "eval_runtime": 53.269, "eval_samples_per_second": 563.179, "eval_steps_per_second": 35.199, "step": 56000 }, { "epoch": 0.552518604719389, "grad_norm": 0.04964112490415573, "learning_rate": 0.0005580594444444444, "loss": 0.0132, "step": 56500 }, { "epoch": 0.5574081498938969, "grad_norm": 0.08856749534606934, "learning_rate": 0.000556115, "loss": 0.0128, "step": 57000 }, { "epoch": 0.5574081498938969, "eval_accuracy": 0.998819, "eval_loss": 0.007981804199516773, "eval_runtime": 54.4577, "eval_samples_per_second": 550.886, "eval_steps_per_second": 34.43, "step": 57000 }, { "epoch": 0.5622976950684048, "grad_norm": 0.06801512092351913, "learning_rate": 0.0005541705555555555, "loss": 0.0129, "step": 57500 }, { "epoch": 0.5671872402429126, "grad_norm": 0.21337199211120605, "learning_rate": 0.0005522261111111112, "loss": 0.0131, "step": 58000 }, { "epoch": 0.5671872402429126, "eval_accuracy": 0.9988361904761904, "eval_loss": 0.0077649368904531, "eval_runtime": 53.1495, "eval_samples_per_second": 564.445, "eval_steps_per_second": 35.278, "step": 58000 }, { "epoch": 0.5720767854174205, "grad_norm": 0.0754612609744072, "learning_rate": 0.0005502816666666667, "loss": 0.013, "step": 58500 }, { "epoch": 0.5769663305919284, "grad_norm": 0.053277261555194855, "learning_rate": 0.0005483372222222223, "loss": 0.013, "step": 59000 }, { "epoch": 0.5769663305919284, "eval_accuracy": 0.9988713333333333, "eval_loss": 0.007438257802277803, "eval_runtime": 53.9725, "eval_samples_per_second": 555.839, "eval_steps_per_second": 34.74, "step": 59000 }, { "epoch": 0.5818558757664362, "grad_norm": 0.057580217719078064, "learning_rate": 0.0005463927777777778, "loss": 0.0126, "step": 59500 }, { "epoch": 0.5867454209409441, "grad_norm": 0.08538717031478882, "learning_rate": 0.0005444483333333334, "loss": 0.0125, "step": 60000 }, { "epoch": 0.5867454209409441, "eval_accuracy": 0.9988772857142857, "eval_loss": 0.0073426892049610615, "eval_runtime": 53.9301, "eval_samples_per_second": 556.276, "eval_steps_per_second": 34.767, "step": 60000 }, { "epoch": 0.591634966115452, "grad_norm": 0.07628747820854187, "learning_rate": 0.0005425038888888889, "loss": 0.0127, "step": 60500 }, { "epoch": 0.5965245112899598, "grad_norm": 0.059503812342882156, "learning_rate": 0.0005405594444444444, "loss": 0.0119, "step": 61000 }, { "epoch": 0.5965245112899598, "eval_accuracy": 0.9988955714285714, "eval_loss": 0.007260579615831375, "eval_runtime": 54.1591, "eval_samples_per_second": 553.924, "eval_steps_per_second": 34.62, "step": 61000 }, { "epoch": 0.6014140564644677, "grad_norm": 0.07128513604402542, "learning_rate": 0.000538615, "loss": 0.012, "step": 61500 }, { "epoch": 0.6063036016389756, "grad_norm": 0.0615658275783062, "learning_rate": 0.0005366705555555555, "loss": 0.0121, "step": 62000 }, { "epoch": 0.6063036016389756, "eval_accuracy": 0.9988924285714286, "eval_loss": 0.007354605942964554, "eval_runtime": 53.6133, "eval_samples_per_second": 559.563, "eval_steps_per_second": 34.973, "step": 62000 }, { "epoch": 0.6111931468134834, "grad_norm": 0.04977503791451454, "learning_rate": 0.0005347261111111111, "loss": 0.0125, "step": 62500 }, { "epoch": 0.6160826919879913, "grad_norm": 0.06748691946268082, "learning_rate": 0.0005327816666666666, "loss": 0.0123, "step": 63000 }, { "epoch": 0.6160826919879913, "eval_accuracy": 0.9989074761904762, "eval_loss": 0.007279036566615105, "eval_runtime": 54.221, "eval_samples_per_second": 553.291, "eval_steps_per_second": 34.581, "step": 63000 }, { "epoch": 0.6209722371624992, "grad_norm": 0.08432789891958237, "learning_rate": 0.0005308372222222222, "loss": 0.0119, "step": 63500 }, { "epoch": 0.6258617823370071, "grad_norm": 0.08450587093830109, "learning_rate": 0.0005288927777777778, "loss": 0.0123, "step": 64000 }, { "epoch": 0.6258617823370071, "eval_accuracy": 0.998906619047619, "eval_loss": 0.007195043843239546, "eval_runtime": 53.6077, "eval_samples_per_second": 559.621, "eval_steps_per_second": 34.976, "step": 64000 }, { "epoch": 0.6307513275115149, "grad_norm": 0.05454770103096962, "learning_rate": 0.0005269483333333333, "loss": 0.0119, "step": 64500 }, { "epoch": 0.6356408726860228, "grad_norm": 0.029517434537410736, "learning_rate": 0.0005250038888888889, "loss": 0.0115, "step": 65000 }, { "epoch": 0.6356408726860228, "eval_accuracy": 0.99894, "eval_loss": 0.006976461503654718, "eval_runtime": 54.3436, "eval_samples_per_second": 552.043, "eval_steps_per_second": 34.503, "step": 65000 }, { "epoch": 0.6405304178605307, "grad_norm": 0.08749569207429886, "learning_rate": 0.0005230594444444444, "loss": 0.0117, "step": 65500 }, { "epoch": 0.6454199630350385, "grad_norm": 0.08669404685497284, "learning_rate": 0.000521115, "loss": 0.0118, "step": 66000 }, { "epoch": 0.6454199630350385, "eval_accuracy": 0.9989269523809524, "eval_loss": 0.0070405821315944195, "eval_runtime": 53.0176, "eval_samples_per_second": 565.85, "eval_steps_per_second": 35.366, "step": 66000 }, { "epoch": 0.6503095082095464, "grad_norm": 0.08068472146987915, "learning_rate": 0.0005191705555555556, "loss": 0.012, "step": 66500 }, { "epoch": 0.6551990533840543, "grad_norm": 0.06560824811458588, "learning_rate": 0.0005172261111111111, "loss": 0.0113, "step": 67000 }, { "epoch": 0.6551990533840543, "eval_accuracy": 0.9989625238095238, "eval_loss": 0.006877726875245571, "eval_runtime": 57.7066, "eval_samples_per_second": 519.871, "eval_steps_per_second": 32.492, "step": 67000 }, { "epoch": 0.6600885985585621, "grad_norm": 0.10351342707872391, "learning_rate": 0.0005152816666666667, "loss": 0.0117, "step": 67500 }, { "epoch": 0.66497814373307, "grad_norm": 0.06295846402645111, "learning_rate": 0.0005133372222222222, "loss": 0.0113, "step": 68000 }, { "epoch": 0.66497814373307, "eval_accuracy": 0.9989663333333333, "eval_loss": 0.006734638474881649, "eval_runtime": 56.5776, "eval_samples_per_second": 530.245, "eval_steps_per_second": 33.14, "step": 68000 }, { "epoch": 0.6698676889075779, "grad_norm": 0.07197780162096024, "learning_rate": 0.0005113927777777778, "loss": 0.0112, "step": 68500 }, { "epoch": 0.6747572340820857, "grad_norm": 0.05394699051976204, "learning_rate": 0.0005094483333333333, "loss": 0.0111, "step": 69000 }, { "epoch": 0.6747572340820857, "eval_accuracy": 0.9989654761904762, "eval_loss": 0.006897720508277416, "eval_runtime": 53.9516, "eval_samples_per_second": 556.054, "eval_steps_per_second": 34.753, "step": 69000 }, { "epoch": 0.6796467792565936, "grad_norm": 0.08804675191640854, "learning_rate": 0.0005075038888888889, "loss": 0.0114, "step": 69500 }, { "epoch": 0.6845363244311015, "grad_norm": 0.061258211731910706, "learning_rate": 0.0005055594444444445, "loss": 0.0116, "step": 70000 }, { "epoch": 0.6845363244311015, "eval_accuracy": 0.998991619047619, "eval_loss": 0.006613132543861866, "eval_runtime": 53.4248, "eval_samples_per_second": 561.537, "eval_steps_per_second": 35.096, "step": 70000 }, { "epoch": 0.6894258696056093, "grad_norm": 0.047413647174835205, "learning_rate": 0.000503615, "loss": 0.0114, "step": 70500 }, { "epoch": 0.6943154147801172, "grad_norm": 0.048444923013448715, "learning_rate": 0.0005016705555555556, "loss": 0.0111, "step": 71000 }, { "epoch": 0.6943154147801172, "eval_accuracy": 0.9989892857142857, "eval_loss": 0.006757956929504871, "eval_runtime": 54.0915, "eval_samples_per_second": 554.616, "eval_steps_per_second": 34.663, "step": 71000 }, { "epoch": 0.6992049599546251, "grad_norm": 0.0633966252207756, "learning_rate": 0.0004997261111111111, "loss": 0.011, "step": 71500 }, { "epoch": 0.7040945051291329, "grad_norm": 0.05330997332930565, "learning_rate": 0.0004977816666666666, "loss": 0.0111, "step": 72000 }, { "epoch": 0.7040945051291329, "eval_accuracy": 0.9989945238095238, "eval_loss": 0.006628294009715319, "eval_runtime": 53.5745, "eval_samples_per_second": 559.968, "eval_steps_per_second": 34.998, "step": 72000 }, { "epoch": 0.7089840503036408, "grad_norm": 0.08384311944246292, "learning_rate": 0.0004958372222222222, "loss": 0.0112, "step": 72500 }, { "epoch": 0.7138735954781487, "grad_norm": 0.012912419624626637, "learning_rate": 0.0004938927777777777, "loss": 0.0108, "step": 73000 }, { "epoch": 0.7138735954781487, "eval_accuracy": 0.9990231904761905, "eval_loss": 0.0064848195761442184, "eval_runtime": 53.4714, "eval_samples_per_second": 561.048, "eval_steps_per_second": 35.065, "step": 73000 }, { "epoch": 0.7187631406526565, "grad_norm": 0.03586062043905258, "learning_rate": 0.0004919483333333333, "loss": 0.0106, "step": 73500 }, { "epoch": 0.7236526858271644, "grad_norm": 0.03920240327715874, "learning_rate": 0.0004900038888888888, "loss": 0.0108, "step": 74000 }, { "epoch": 0.7236526858271644, "eval_accuracy": 0.9990156666666666, "eval_loss": 0.00646663922816515, "eval_runtime": 53.3973, "eval_samples_per_second": 561.827, "eval_steps_per_second": 35.114, "step": 74000 }, { "epoch": 0.7285422310016723, "grad_norm": 0.07299363613128662, "learning_rate": 0.00048805944444444446, "loss": 0.0109, "step": 74500 }, { "epoch": 0.73343177617618, "grad_norm": 0.061152711510658264, "learning_rate": 0.000486115, "loss": 0.0102, "step": 75000 }, { "epoch": 0.73343177617618, "eval_accuracy": 0.9990235714285715, "eval_loss": 0.00648918654769659, "eval_runtime": 54.8259, "eval_samples_per_second": 547.187, "eval_steps_per_second": 34.199, "step": 75000 }, { "epoch": 0.738321321350688, "grad_norm": 0.052978385239839554, "learning_rate": 0.0004841705555555556, "loss": 0.0108, "step": 75500 }, { "epoch": 0.7432108665251959, "grad_norm": 0.03460371494293213, "learning_rate": 0.00048222611111111113, "loss": 0.0104, "step": 76000 }, { "epoch": 0.7432108665251959, "eval_accuracy": 0.9990159047619047, "eval_loss": 0.006446553394198418, "eval_runtime": 53.4946, "eval_samples_per_second": 560.804, "eval_steps_per_second": 35.05, "step": 76000 }, { "epoch": 0.7481004116997036, "grad_norm": 0.08936499804258347, "learning_rate": 0.0004802816666666667, "loss": 0.0105, "step": 76500 }, { "epoch": 0.7529899568742116, "grad_norm": 0.04613318666815758, "learning_rate": 0.00047833722222222224, "loss": 0.0104, "step": 77000 }, { "epoch": 0.7529899568742116, "eval_accuracy": 0.9990385714285714, "eval_loss": 0.0063977050594985485, "eval_runtime": 54.4466, "eval_samples_per_second": 550.998, "eval_steps_per_second": 34.437, "step": 77000 }, { "epoch": 0.7578795020487195, "grad_norm": 0.05318485200405121, "learning_rate": 0.00047639277777777775, "loss": 0.0106, "step": 77500 }, { "epoch": 0.7627690472232272, "grad_norm": 0.061067450791597366, "learning_rate": 0.0004744483333333333, "loss": 0.0101, "step": 78000 }, { "epoch": 0.7627690472232272, "eval_accuracy": 0.9990490952380953, "eval_loss": 0.006357032340019941, "eval_runtime": 53.2067, "eval_samples_per_second": 563.839, "eval_steps_per_second": 35.24, "step": 78000 }, { "epoch": 0.7676585923977352, "grad_norm": 0.042733557522296906, "learning_rate": 0.00047250388888888886, "loss": 0.01, "step": 78500 }, { "epoch": 0.772548137572243, "grad_norm": 0.08034121245145798, "learning_rate": 0.0004705594444444444, "loss": 0.0103, "step": 79000 }, { "epoch": 0.772548137572243, "eval_accuracy": 0.9990574761904762, "eval_loss": 0.0062187593430280685, "eval_runtime": 53.8428, "eval_samples_per_second": 557.177, "eval_steps_per_second": 34.824, "step": 79000 }, { "epoch": 0.7774376827467508, "grad_norm": 0.07830695807933807, "learning_rate": 0.00046861499999999997, "loss": 0.0101, "step": 79500 }, { "epoch": 0.7823272279212587, "grad_norm": 0.07382604479789734, "learning_rate": 0.00046667055555555553, "loss": 0.0103, "step": 80000 }, { "epoch": 0.7823272279212587, "eval_accuracy": 0.9990759047619048, "eval_loss": 0.006141056306660175, "eval_runtime": 53.2408, "eval_samples_per_second": 563.478, "eval_steps_per_second": 35.217, "step": 80000 }, { "epoch": 0.7872167730957667, "grad_norm": 0.1125330850481987, "learning_rate": 0.00046472611111111114, "loss": 0.0102, "step": 80500 }, { "epoch": 0.7921063182702744, "grad_norm": 0.03520214557647705, "learning_rate": 0.0004627816666666667, "loss": 0.01, "step": 81000 }, { "epoch": 0.7921063182702744, "eval_accuracy": 0.9990739047619047, "eval_loss": 0.006076267920434475, "eval_runtime": 53.791, "eval_samples_per_second": 557.715, "eval_steps_per_second": 34.857, "step": 81000 }, { "epoch": 0.7969958634447823, "grad_norm": 0.042487915605306625, "learning_rate": 0.00046083722222222225, "loss": 0.0097, "step": 81500 }, { "epoch": 0.8018854086192903, "grad_norm": 0.054117601364851, "learning_rate": 0.0004588927777777778, "loss": 0.0101, "step": 82000 }, { "epoch": 0.8018854086192903, "eval_accuracy": 0.9990634761904762, "eval_loss": 0.006164718419313431, "eval_runtime": 53.5332, "eval_samples_per_second": 560.4, "eval_steps_per_second": 35.025, "step": 82000 }, { "epoch": 0.8067749537937982, "grad_norm": 0.04976029694080353, "learning_rate": 0.00045694833333333336, "loss": 0.0099, "step": 82500 }, { "epoch": 0.811664498968306, "grad_norm": 0.054267916828393936, "learning_rate": 0.00045500388888888887, "loss": 0.0097, "step": 83000 }, { "epoch": 0.811664498968306, "eval_accuracy": 0.9990979047619047, "eval_loss": 0.005987876560539007, "eval_runtime": 54.2289, "eval_samples_per_second": 553.211, "eval_steps_per_second": 34.576, "step": 83000 }, { "epoch": 0.8165540441428139, "grad_norm": 0.03349093720316887, "learning_rate": 0.0004530594444444444, "loss": 0.0094, "step": 83500 }, { "epoch": 0.8214435893173218, "grad_norm": 0.04999032989144325, "learning_rate": 0.000451115, "loss": 0.0101, "step": 84000 }, { "epoch": 0.8214435893173218, "eval_accuracy": 0.9990905714285714, "eval_loss": 0.006010835990309715, "eval_runtime": 53.4299, "eval_samples_per_second": 561.483, "eval_steps_per_second": 35.093, "step": 84000 }, { "epoch": 0.8263331344918295, "grad_norm": 0.045149870216846466, "learning_rate": 0.00044917055555555554, "loss": 0.0097, "step": 84500 }, { "epoch": 0.8312226796663374, "grad_norm": 0.0918109267950058, "learning_rate": 0.0004472261111111111, "loss": 0.0099, "step": 85000 }, { "epoch": 0.8312226796663374, "eval_accuracy": 0.9990772380952381, "eval_loss": 0.006181794218719006, "eval_runtime": 54.1897, "eval_samples_per_second": 553.611, "eval_steps_per_second": 34.601, "step": 85000 }, { "epoch": 0.8361122248408454, "grad_norm": 0.0643276646733284, "learning_rate": 0.00044528166666666665, "loss": 0.0099, "step": 85500 }, { "epoch": 0.8410017700153531, "grad_norm": 0.06930361688137054, "learning_rate": 0.0004433372222222222, "loss": 0.0095, "step": 86000 }, { "epoch": 0.8410017700153531, "eval_accuracy": 0.9991025238095238, "eval_loss": 0.00590873695909977, "eval_runtime": 52.8942, "eval_samples_per_second": 567.169, "eval_steps_per_second": 35.448, "step": 86000 }, { "epoch": 0.845891315189861, "grad_norm": 0.08163397759199142, "learning_rate": 0.00044139277777777776, "loss": 0.0099, "step": 86500 }, { "epoch": 0.850780860364369, "grad_norm": 0.0483279749751091, "learning_rate": 0.00043944833333333337, "loss": 0.0092, "step": 87000 }, { "epoch": 0.850780860364369, "eval_accuracy": 0.9991092857142857, "eval_loss": 0.006001894827932119, "eval_runtime": 53.2268, "eval_samples_per_second": 563.626, "eval_steps_per_second": 35.227, "step": 87000 }, { "epoch": 0.8556704055388767, "grad_norm": 0.02636638656258583, "learning_rate": 0.00043750388888888893, "loss": 0.0094, "step": 87500 }, { "epoch": 0.8605599507133846, "grad_norm": 0.042217135429382324, "learning_rate": 0.0004355594444444445, "loss": 0.0092, "step": 88000 }, { "epoch": 0.8605599507133846, "eval_accuracy": 0.999128, "eval_loss": 0.005815317388623953, "eval_runtime": 53.8299, "eval_samples_per_second": 557.311, "eval_steps_per_second": 34.832, "step": 88000 }, { "epoch": 0.8654494958878926, "grad_norm": 0.08632192760705948, "learning_rate": 0.00043361499999999993, "loss": 0.0092, "step": 88500 }, { "epoch": 0.8703390410624003, "grad_norm": 0.04315312206745148, "learning_rate": 0.00043167055555555554, "loss": 0.0094, "step": 89000 }, { "epoch": 0.8703390410624003, "eval_accuracy": 0.9991279047619047, "eval_loss": 0.0056898752227425575, "eval_runtime": 53.7075, "eval_samples_per_second": 558.581, "eval_steps_per_second": 34.911, "step": 89000 }, { "epoch": 0.8752285862369082, "grad_norm": 0.03837065026164055, "learning_rate": 0.0004297261111111111, "loss": 0.0094, "step": 89500 }, { "epoch": 0.8801181314114161, "grad_norm": 0.04201444238424301, "learning_rate": 0.00042778166666666666, "loss": 0.0093, "step": 90000 }, { "epoch": 0.8801181314114161, "eval_accuracy": 0.9991310952380953, "eval_loss": 0.00587738212198019, "eval_runtime": 53.5135, "eval_samples_per_second": 560.606, "eval_steps_per_second": 35.038, "step": 90000 }, { "epoch": 0.8850076765859239, "grad_norm": 0.061635617166757584, "learning_rate": 0.0004258372222222222, "loss": 0.0092, "step": 90500 }, { "epoch": 0.8898972217604318, "grad_norm": 0.03518196567893028, "learning_rate": 0.00042389277777777777, "loss": 0.0088, "step": 91000 }, { "epoch": 0.8898972217604318, "eval_accuracy": 0.9991415238095238, "eval_loss": 0.005721970461308956, "eval_runtime": 53.7456, "eval_samples_per_second": 558.185, "eval_steps_per_second": 34.887, "step": 91000 }, { "epoch": 0.8947867669349397, "grad_norm": 0.06095174327492714, "learning_rate": 0.0004219483333333333, "loss": 0.0095, "step": 91500 }, { "epoch": 0.8996763121094475, "grad_norm": 0.03404530510306358, "learning_rate": 0.0004200038888888889, "loss": 0.0091, "step": 92000 }, { "epoch": 0.8996763121094475, "eval_accuracy": 0.9991448571428572, "eval_loss": 0.0056047323159873486, "eval_runtime": 53.6229, "eval_samples_per_second": 559.463, "eval_steps_per_second": 34.966, "step": 92000 }, { "epoch": 0.9045658572839554, "grad_norm": 0.044711388647556305, "learning_rate": 0.00041805944444444444, "loss": 0.0094, "step": 92500 }, { "epoch": 0.9094554024584633, "grad_norm": 0.025318428874015808, "learning_rate": 0.000416115, "loss": 0.0091, "step": 93000 }, { "epoch": 0.9094554024584633, "eval_accuracy": 0.9991459047619048, "eval_loss": 0.0056663015857338905, "eval_runtime": 53.7217, "eval_samples_per_second": 558.433, "eval_steps_per_second": 34.902, "step": 93000 }, { "epoch": 0.9143449476329711, "grad_norm": 0.09479326009750366, "learning_rate": 0.0004141705555555556, "loss": 0.0091, "step": 93500 }, { "epoch": 0.919234492807479, "grad_norm": 0.04621125012636185, "learning_rate": 0.00041222611111111116, "loss": 0.0091, "step": 94000 }, { "epoch": 0.919234492807479, "eval_accuracy": 0.9991637619047619, "eval_loss": 0.005490881856530905, "eval_runtime": 52.8914, "eval_samples_per_second": 567.2, "eval_steps_per_second": 35.45, "step": 94000 }, { "epoch": 0.924124037981987, "grad_norm": 0.11758420616388321, "learning_rate": 0.0004102816666666666, "loss": 0.0091, "step": 94500 }, { "epoch": 0.9290135831564947, "grad_norm": 0.048568353056907654, "learning_rate": 0.00040833722222222217, "loss": 0.0085, "step": 95000 }, { "epoch": 0.9290135831564947, "eval_accuracy": 0.9991408571428572, "eval_loss": 0.0056878020986914635, "eval_runtime": 54.7817, "eval_samples_per_second": 547.628, "eval_steps_per_second": 34.227, "step": 95000 }, { "epoch": 0.9339031283310026, "grad_norm": 0.12460034340620041, "learning_rate": 0.0004063927777777778, "loss": 0.0089, "step": 95500 }, { "epoch": 0.9387926735055105, "grad_norm": 0.04623766988515854, "learning_rate": 0.00040444833333333334, "loss": 0.0087, "step": 96000 }, { "epoch": 0.9387926735055105, "eval_accuracy": 0.9991676190476191, "eval_loss": 0.005500451661646366, "eval_runtime": 53.9981, "eval_samples_per_second": 555.575, "eval_steps_per_second": 34.723, "step": 96000 }, { "epoch": 0.9436822186800183, "grad_norm": 0.08665420114994049, "learning_rate": 0.0004025038888888889, "loss": 0.0087, "step": 96500 }, { "epoch": 0.9485717638545262, "grad_norm": 0.0452926941215992, "learning_rate": 0.00040055944444444445, "loss": 0.0084, "step": 97000 }, { "epoch": 0.9485717638545262, "eval_accuracy": 0.999164, "eval_loss": 0.005574519746005535, "eval_runtime": 54.6981, "eval_samples_per_second": 548.465, "eval_steps_per_second": 34.279, "step": 97000 }, { "epoch": 0.9534613090290341, "grad_norm": 0.03491511195898056, "learning_rate": 0.000398615, "loss": 0.0086, "step": 97500 }, { "epoch": 0.9583508542035419, "grad_norm": 0.044573381543159485, "learning_rate": 0.00039667055555555556, "loss": 0.0089, "step": 98000 }, { "epoch": 0.9583508542035419, "eval_accuracy": 0.9991894285714286, "eval_loss": 0.005372173152863979, "eval_runtime": 53.4094, "eval_samples_per_second": 561.699, "eval_steps_per_second": 35.106, "step": 98000 }, { "epoch": 0.9632403993780498, "grad_norm": 0.02608780935406685, "learning_rate": 0.0003947261111111111, "loss": 0.0086, "step": 98500 }, { "epoch": 0.9681299445525577, "grad_norm": 0.04312971234321594, "learning_rate": 0.0003927816666666667, "loss": 0.0086, "step": 99000 }, { "epoch": 0.9681299445525577, "eval_accuracy": 0.9991722380952381, "eval_loss": 0.0054678237065672874, "eval_runtime": 54.0015, "eval_samples_per_second": 555.541, "eval_steps_per_second": 34.721, "step": 99000 }, { "epoch": 0.9730194897270655, "grad_norm": 0.06294015049934387, "learning_rate": 0.00039083722222222223, "loss": 0.0085, "step": 99500 }, { "epoch": 0.9779090349015734, "grad_norm": 0.029000315815210342, "learning_rate": 0.00038889277777777773, "loss": 0.0087, "step": 100000 }, { "epoch": 0.9779090349015734, "eval_accuracy": 0.999185380952381, "eval_loss": 0.005396171938627958, "eval_runtime": 55.6579, "eval_samples_per_second": 539.007, "eval_steps_per_second": 33.688, "step": 100000 }, { "epoch": 0.9827985800760813, "grad_norm": 0.04323006793856621, "learning_rate": 0.0003869483333333333, "loss": 0.0087, "step": 100500 }, { "epoch": 0.9876881252505892, "grad_norm": 0.0731167271733284, "learning_rate": 0.00038500388888888885, "loss": 0.0081, "step": 101000 }, { "epoch": 0.9876881252505892, "eval_accuracy": 0.9991765238095238, "eval_loss": 0.005412892438471317, "eval_runtime": 55.9769, "eval_samples_per_second": 535.935, "eval_steps_per_second": 33.496, "step": 101000 }, { "epoch": 0.992577670425097, "grad_norm": 0.023585299029946327, "learning_rate": 0.0003830594444444444, "loss": 0.0088, "step": 101500 }, { "epoch": 0.9974672155996049, "grad_norm": 0.08938384801149368, "learning_rate": 0.000381115, "loss": 0.0086, "step": 102000 }, { "epoch": 0.9974672155996049, "eval_accuracy": 0.9991979047619047, "eval_loss": 0.005323469173163176, "eval_runtime": 53.2851, "eval_samples_per_second": 563.009, "eval_steps_per_second": 35.188, "step": 102000 }, { "epoch": 1.0023567607741128, "grad_norm": 0.038682036101818085, "learning_rate": 0.00037917055555555557, "loss": 0.0082, "step": 102500 }, { "epoch": 1.0072463059486207, "grad_norm": 0.07080361992120743, "learning_rate": 0.0003772261111111111, "loss": 0.0081, "step": 103000 }, { "epoch": 1.0072463059486207, "eval_accuracy": 0.9992074761904762, "eval_loss": 0.00541540514677763, "eval_runtime": 54.1542, "eval_samples_per_second": 553.974, "eval_steps_per_second": 34.623, "step": 103000 }, { "epoch": 1.0121358511231284, "grad_norm": 0.0545232892036438, "learning_rate": 0.0003752816666666667, "loss": 0.0079, "step": 103500 }, { "epoch": 1.0170253962976363, "grad_norm": 0.05419744551181793, "learning_rate": 0.00037333722222222224, "loss": 0.0083, "step": 104000 }, { "epoch": 1.0170253962976363, "eval_accuracy": 0.999227380952381, "eval_loss": 0.005181997548788786, "eval_runtime": 54.7563, "eval_samples_per_second": 547.882, "eval_steps_per_second": 34.243, "step": 104000 }, { "epoch": 1.0219149414721442, "grad_norm": 0.062064480036497116, "learning_rate": 0.0003713927777777778, "loss": 0.0078, "step": 104500 }, { "epoch": 1.0268044866466521, "grad_norm": 0.0431884303689003, "learning_rate": 0.00036944833333333335, "loss": 0.0078, "step": 105000 }, { "epoch": 1.0268044866466521, "eval_accuracy": 0.999227380952381, "eval_loss": 0.005218331702053547, "eval_runtime": 53.5479, "eval_samples_per_second": 560.246, "eval_steps_per_second": 35.015, "step": 105000 }, { "epoch": 1.03169403182116, "grad_norm": 0.035419270396232605, "learning_rate": 0.00036750388888888885, "loss": 0.0079, "step": 105500 }, { "epoch": 1.036583576995668, "grad_norm": 0.03565732017159462, "learning_rate": 0.0003655594444444444, "loss": 0.0078, "step": 106000 }, { "epoch": 1.036583576995668, "eval_accuracy": 0.9992299523809524, "eval_loss": 0.005135852377861738, "eval_runtime": 54.0577, "eval_samples_per_second": 554.962, "eval_steps_per_second": 34.685, "step": 106000 }, { "epoch": 1.0414731221701756, "grad_norm": 0.04575124382972717, "learning_rate": 0.00036361499999999997, "loss": 0.0076, "step": 106500 }, { "epoch": 1.0463626673446835, "grad_norm": 0.07697087526321411, "learning_rate": 0.0003616705555555555, "loss": 0.0076, "step": 107000 }, { "epoch": 1.0463626673446835, "eval_accuracy": 0.9992333809523809, "eval_loss": 0.005050502717494965, "eval_runtime": 53.4533, "eval_samples_per_second": 561.238, "eval_steps_per_second": 35.077, "step": 107000 }, { "epoch": 1.0512522125191914, "grad_norm": 0.05499347671866417, "learning_rate": 0.0003597261111111111, "loss": 0.0079, "step": 107500 }, { "epoch": 1.0561417576936993, "grad_norm": 0.035594772547483444, "learning_rate": 0.00035778166666666664, "loss": 0.0081, "step": 108000 }, { "epoch": 1.0561417576936993, "eval_accuracy": 0.9992301428571428, "eval_loss": 0.0050900341011583805, "eval_runtime": 53.2622, "eval_samples_per_second": 563.251, "eval_steps_per_second": 35.203, "step": 108000 }, { "epoch": 1.0610313028682072, "grad_norm": 0.020569855347275734, "learning_rate": 0.00035583722222222225, "loss": 0.0077, "step": 108500 }, { "epoch": 1.0659208480427151, "grad_norm": 0.06758717447519302, "learning_rate": 0.0003538927777777778, "loss": 0.0082, "step": 109000 }, { "epoch": 1.0659208480427151, "eval_accuracy": 0.9992373333333333, "eval_loss": 0.005076898727566004, "eval_runtime": 53.4707, "eval_samples_per_second": 561.054, "eval_steps_per_second": 35.066, "step": 109000 }, { "epoch": 1.070810393217223, "grad_norm": 0.04208175465464592, "learning_rate": 0.00035194833333333336, "loss": 0.0079, "step": 109500 }, { "epoch": 1.0756999383917307, "grad_norm": 0.040982868522405624, "learning_rate": 0.0003500038888888889, "loss": 0.0074, "step": 110000 }, { "epoch": 1.0756999383917307, "eval_accuracy": 0.9992489523809523, "eval_loss": 0.00500760693103075, "eval_runtime": 54.1302, "eval_samples_per_second": 554.219, "eval_steps_per_second": 34.639, "step": 110000 }, { "epoch": 1.0805894835662386, "grad_norm": 0.05090247467160225, "learning_rate": 0.0003480594444444444, "loss": 0.0075, "step": 110500 }, { "epoch": 1.0854790287407465, "grad_norm": 0.02564290165901184, "learning_rate": 0.000346115, "loss": 0.0077, "step": 111000 }, { "epoch": 1.0854790287407465, "eval_accuracy": 0.9992412380952381, "eval_loss": 0.005068215075880289, "eval_runtime": 53.2721, "eval_samples_per_second": 563.147, "eval_steps_per_second": 35.197, "step": 111000 }, { "epoch": 1.0903685739152544, "grad_norm": 0.032404959201812744, "learning_rate": 0.0003441705555555556, "loss": 0.0076, "step": 111500 }, { "epoch": 1.0952581190897623, "grad_norm": 0.05177515000104904, "learning_rate": 0.00034222611111111114, "loss": 0.0077, "step": 112000 }, { "epoch": 1.0952581190897623, "eval_accuracy": 0.9992587142857143, "eval_loss": 0.00494408467784524, "eval_runtime": 53.7598, "eval_samples_per_second": 558.038, "eval_steps_per_second": 34.877, "step": 112000 }, { "epoch": 1.10014766426427, "grad_norm": 0.041296541690826416, "learning_rate": 0.00034028166666666664, "loss": 0.0076, "step": 112500 }, { "epoch": 1.105037209438778, "grad_norm": 0.027352752164006233, "learning_rate": 0.0003383372222222222, "loss": 0.0077, "step": 113000 }, { "epoch": 1.105037209438778, "eval_accuracy": 0.9992613333333333, "eval_loss": 0.004911018069833517, "eval_runtime": 53.361, "eval_samples_per_second": 562.209, "eval_steps_per_second": 35.138, "step": 113000 }, { "epoch": 1.1099267546132858, "grad_norm": 0.017891952767968178, "learning_rate": 0.00033639277777777776, "loss": 0.0074, "step": 113500 }, { "epoch": 1.1148162997877937, "grad_norm": 0.10825661569833755, "learning_rate": 0.0003344483333333333, "loss": 0.0077, "step": 114000 }, { "epoch": 1.1148162997877937, "eval_accuracy": 0.9992698095238095, "eval_loss": 0.004937721882015467, "eval_runtime": 53.9545, "eval_samples_per_second": 556.024, "eval_steps_per_second": 34.752, "step": 114000 }, { "epoch": 1.1197058449623016, "grad_norm": 0.0252179317176342, "learning_rate": 0.00033250388888888887, "loss": 0.0072, "step": 114500 }, { "epoch": 1.1245953901368095, "grad_norm": 0.10007605701684952, "learning_rate": 0.0003305594444444445, "loss": 0.0073, "step": 115000 }, { "epoch": 1.1245953901368095, "eval_accuracy": 0.9992664285714286, "eval_loss": 0.005000779405236244, "eval_runtime": 53.4444, "eval_samples_per_second": 561.331, "eval_steps_per_second": 35.083, "step": 115000 }, { "epoch": 1.1294849353113174, "grad_norm": 0.08812825381755829, "learning_rate": 0.000328615, "loss": 0.0076, "step": 115500 }, { "epoch": 1.1343744804858251, "grad_norm": 0.04212397709488869, "learning_rate": 0.00032667055555555554, "loss": 0.0071, "step": 116000 }, { "epoch": 1.1343744804858251, "eval_accuracy": 0.9992689523809524, "eval_loss": 0.0048895059153437614, "eval_runtime": 56.3714, "eval_samples_per_second": 532.185, "eval_steps_per_second": 33.262, "step": 116000 }, { "epoch": 1.139264025660333, "grad_norm": 0.02763226442039013, "learning_rate": 0.0003247261111111111, "loss": 0.0075, "step": 116500 }, { "epoch": 1.144153570834841, "grad_norm": 0.05487339198589325, "learning_rate": 0.00032278166666666665, "loss": 0.0074, "step": 117000 }, { "epoch": 1.144153570834841, "eval_accuracy": 0.9992661428571429, "eval_loss": 0.004837568383663893, "eval_runtime": 54.3925, "eval_samples_per_second": 551.547, "eval_steps_per_second": 34.472, "step": 117000 }, { "epoch": 1.1490431160093488, "grad_norm": 0.04747488722205162, "learning_rate": 0.0003208372222222222, "loss": 0.0075, "step": 117500 }, { "epoch": 1.1539326611838567, "grad_norm": 0.10006921738386154, "learning_rate": 0.00031889277777777777, "loss": 0.0074, "step": 118000 }, { "epoch": 1.1539326611838567, "eval_accuracy": 0.9992860476190476, "eval_loss": 0.0047850459814071655, "eval_runtime": 53.7241, "eval_samples_per_second": 558.408, "eval_steps_per_second": 34.901, "step": 118000 }, { "epoch": 1.1588222063583646, "grad_norm": 0.03712115064263344, "learning_rate": 0.0003169483333333333, "loss": 0.0075, "step": 118500 }, { "epoch": 1.1637117515328723, "grad_norm": 0.05919933691620827, "learning_rate": 0.0003150038888888889, "loss": 0.0073, "step": 119000 }, { "epoch": 1.1637117515328723, "eval_accuracy": 0.9992771428571429, "eval_loss": 0.004803878720849752, "eval_runtime": 53.7517, "eval_samples_per_second": 558.121, "eval_steps_per_second": 34.883, "step": 119000 }, { "epoch": 1.1686012967073802, "grad_norm": 0.017905965447425842, "learning_rate": 0.00031305944444444444, "loss": 0.0069, "step": 119500 }, { "epoch": 1.1734908418818881, "grad_norm": 0.05728234723210335, "learning_rate": 0.000311115, "loss": 0.007, "step": 120000 }, { "epoch": 1.1734908418818881, "eval_accuracy": 0.999289, "eval_loss": 0.004755858797580004, "eval_runtime": 53.6273, "eval_samples_per_second": 559.417, "eval_steps_per_second": 34.964, "step": 120000 }, { "epoch": 1.178380387056396, "grad_norm": 0.05677701532840729, "learning_rate": 0.00030917055555555555, "loss": 0.007, "step": 120500 }, { "epoch": 1.183269932230904, "grad_norm": 0.05953844264149666, "learning_rate": 0.0003072261111111111, "loss": 0.0071, "step": 121000 }, { "epoch": 1.183269932230904, "eval_accuracy": 0.999293619047619, "eval_loss": 0.004746082704514265, "eval_runtime": 55.1206, "eval_samples_per_second": 544.262, "eval_steps_per_second": 34.016, "step": 121000 }, { "epoch": 1.1881594774054118, "grad_norm": 0.03433966636657715, "learning_rate": 0.00030528166666666666, "loss": 0.0071, "step": 121500 }, { "epoch": 1.1930490225799195, "grad_norm": 0.0718400701880455, "learning_rate": 0.0003033372222222222, "loss": 0.0073, "step": 122000 }, { "epoch": 1.1930490225799195, "eval_accuracy": 0.9992973333333334, "eval_loss": 0.004623962566256523, "eval_runtime": 54.751, "eval_samples_per_second": 547.935, "eval_steps_per_second": 34.246, "step": 122000 }, { "epoch": 1.1979385677544274, "grad_norm": 0.026871928945183754, "learning_rate": 0.0003013927777777778, "loss": 0.0065, "step": 122500 }, { "epoch": 1.2028281129289353, "grad_norm": 0.015808627009391785, "learning_rate": 0.00029944833333333333, "loss": 0.0069, "step": 123000 }, { "epoch": 1.2028281129289353, "eval_accuracy": 0.9992959047619048, "eval_loss": 0.004734317306429148, "eval_runtime": 53.9604, "eval_samples_per_second": 555.963, "eval_steps_per_second": 34.748, "step": 123000 }, { "epoch": 1.2077176581034432, "grad_norm": 0.06739887595176697, "learning_rate": 0.0002975038888888889, "loss": 0.0071, "step": 123500 }, { "epoch": 1.2126072032779511, "grad_norm": 0.020941952243447304, "learning_rate": 0.00029555944444444444, "loss": 0.007, "step": 124000 }, { "epoch": 1.2126072032779511, "eval_accuracy": 0.9992935238095239, "eval_loss": 0.004609288647770882, "eval_runtime": 54.1194, "eval_samples_per_second": 554.33, "eval_steps_per_second": 34.646, "step": 124000 }, { "epoch": 1.217496748452459, "grad_norm": 0.027827920392155647, "learning_rate": 0.000293615, "loss": 0.007, "step": 124500 }, { "epoch": 1.222386293626967, "grad_norm": 0.08693556487560272, "learning_rate": 0.00029167055555555556, "loss": 0.0069, "step": 125000 }, { "epoch": 1.222386293626967, "eval_accuracy": 0.9993093333333334, "eval_loss": 0.004602524451911449, "eval_runtime": 53.3938, "eval_samples_per_second": 561.863, "eval_steps_per_second": 35.116, "step": 125000 }, { "epoch": 1.2272758388014746, "grad_norm": 0.04795575141906738, "learning_rate": 0.0002897261111111111, "loss": 0.0069, "step": 125500 }, { "epoch": 1.2321653839759825, "grad_norm": 0.07266402244567871, "learning_rate": 0.00028778166666666667, "loss": 0.0071, "step": 126000 }, { "epoch": 1.2321653839759825, "eval_accuracy": 0.9993089523809524, "eval_loss": 0.00456634908914566, "eval_runtime": 54.0494, "eval_samples_per_second": 555.048, "eval_steps_per_second": 34.69, "step": 126000 }, { "epoch": 1.2370549291504904, "grad_norm": 0.03289886936545372, "learning_rate": 0.0002858372222222222, "loss": 0.0072, "step": 126500 }, { "epoch": 1.2419444743249983, "grad_norm": 0.02240580879151821, "learning_rate": 0.0002838927777777778, "loss": 0.007, "step": 127000 }, { "epoch": 1.2419444743249983, "eval_accuracy": 0.9993215714285715, "eval_loss": 0.004485046491026878, "eval_runtime": 53.4392, "eval_samples_per_second": 561.386, "eval_steps_per_second": 35.087, "step": 127000 }, { "epoch": 1.2468340194995062, "grad_norm": 0.040360696613788605, "learning_rate": 0.00028194833333333334, "loss": 0.0068, "step": 127500 }, { "epoch": 1.251723564674014, "grad_norm": 0.032697584480047226, "learning_rate": 0.0002800038888888889, "loss": 0.0072, "step": 128000 }, { "epoch": 1.251723564674014, "eval_accuracy": 0.9993274761904762, "eval_loss": 0.004469048231840134, "eval_runtime": 53.9627, "eval_samples_per_second": 555.939, "eval_steps_per_second": 34.746, "step": 128000 }, { "epoch": 1.2566131098485218, "grad_norm": 0.021058347076177597, "learning_rate": 0.00027805944444444445, "loss": 0.0069, "step": 128500 }, { "epoch": 1.2615026550230297, "grad_norm": 0.036056675016880035, "learning_rate": 0.000276115, "loss": 0.0067, "step": 129000 }, { "epoch": 1.2615026550230297, "eval_accuracy": 0.9993329047619047, "eval_loss": 0.004397740587592125, "eval_runtime": 53.2747, "eval_samples_per_second": 563.12, "eval_steps_per_second": 35.195, "step": 129000 }, { "epoch": 1.2663922001975376, "grad_norm": 0.034787457436323166, "learning_rate": 0.0002741705555555555, "loss": 0.0066, "step": 129500 }, { "epoch": 1.2712817453720455, "grad_norm": 0.05359942466020584, "learning_rate": 0.0002722261111111111, "loss": 0.0065, "step": 130000 }, { "epoch": 1.2712817453720455, "eval_accuracy": 0.9993344761904762, "eval_loss": 0.004399556666612625, "eval_runtime": 53.9523, "eval_samples_per_second": 556.047, "eval_steps_per_second": 34.753, "step": 130000 }, { "epoch": 1.2761712905465534, "grad_norm": 0.02243073098361492, "learning_rate": 0.0002702816666666667, "loss": 0.0068, "step": 130500 }, { "epoch": 1.2810608357210613, "grad_norm": 0.049295682460069656, "learning_rate": 0.00026833722222222223, "loss": 0.0068, "step": 131000 }, { "epoch": 1.2810608357210613, "eval_accuracy": 0.9993318571428571, "eval_loss": 0.004440919030457735, "eval_runtime": 53.2304, "eval_samples_per_second": 563.587, "eval_steps_per_second": 35.224, "step": 131000 }, { "epoch": 1.285950380895569, "grad_norm": 0.021682027727365494, "learning_rate": 0.0002663927777777778, "loss": 0.0067, "step": 131500 }, { "epoch": 1.290839926070077, "grad_norm": 0.0382467582821846, "learning_rate": 0.00026444833333333335, "loss": 0.0067, "step": 132000 }, { "epoch": 1.290839926070077, "eval_accuracy": 0.9993491904761905, "eval_loss": 0.004402833059430122, "eval_runtime": 53.8618, "eval_samples_per_second": 556.981, "eval_steps_per_second": 34.811, "step": 132000 }, { "epoch": 1.2957294712445848, "grad_norm": 0.041405659168958664, "learning_rate": 0.00026250388888888885, "loss": 0.0068, "step": 132500 }, { "epoch": 1.3006190164190927, "grad_norm": 0.039939701557159424, "learning_rate": 0.00026055944444444446, "loss": 0.0064, "step": 133000 }, { "epoch": 1.3006190164190927, "eval_accuracy": 0.9993461904761904, "eval_loss": 0.004411030560731888, "eval_runtime": 52.9835, "eval_samples_per_second": 566.214, "eval_steps_per_second": 35.388, "step": 133000 }, { "epoch": 1.3055085615936006, "grad_norm": 0.07499232143163681, "learning_rate": 0.000258615, "loss": 0.0068, "step": 133500 }, { "epoch": 1.3103981067681083, "grad_norm": 0.03830355405807495, "learning_rate": 0.0002566705555555556, "loss": 0.0066, "step": 134000 }, { "epoch": 1.3103981067681083, "eval_accuracy": 0.9993475238095239, "eval_loss": 0.004307963885366917, "eval_runtime": 54.0847, "eval_samples_per_second": 554.685, "eval_steps_per_second": 34.668, "step": 134000 }, { "epoch": 1.3152876519426164, "grad_norm": 0.04341171681880951, "learning_rate": 0.00025472611111111113, "loss": 0.0064, "step": 134500 }, { "epoch": 1.3201771971171241, "grad_norm": 0.05085453763604164, "learning_rate": 0.00025278166666666663, "loss": 0.0066, "step": 135000 }, { "epoch": 1.3201771971171241, "eval_accuracy": 0.9993423809523809, "eval_loss": 0.004391905851662159, "eval_runtime": 53.489, "eval_samples_per_second": 560.863, "eval_steps_per_second": 35.054, "step": 135000 }, { "epoch": 1.325066742291632, "grad_norm": 0.05465886369347572, "learning_rate": 0.0002508372222222222, "loss": 0.0065, "step": 135500 }, { "epoch": 1.32995628746614, "grad_norm": 0.028779752552509308, "learning_rate": 0.00024889277777777774, "loss": 0.0065, "step": 136000 }, { "epoch": 1.32995628746614, "eval_accuracy": 0.9993518571428571, "eval_loss": 0.004291407763957977, "eval_runtime": 53.5568, "eval_samples_per_second": 560.153, "eval_steps_per_second": 35.01, "step": 136000 }, { "epoch": 1.3348458326406478, "grad_norm": 0.07813508808612823, "learning_rate": 0.00024694833333333336, "loss": 0.0069, "step": 136500 }, { "epoch": 1.3397353778151557, "grad_norm": 0.034233298152685165, "learning_rate": 0.0002450038888888889, "loss": 0.0064, "step": 137000 }, { "epoch": 1.3397353778151557, "eval_accuracy": 0.9993458095238095, "eval_loss": 0.004360624123364687, "eval_runtime": 52.8603, "eval_samples_per_second": 567.534, "eval_steps_per_second": 35.471, "step": 137000 }, { "epoch": 1.3446249229896634, "grad_norm": 0.08024276047945023, "learning_rate": 0.00024305944444444447, "loss": 0.0061, "step": 137500 }, { "epoch": 1.3495144681641713, "grad_norm": 0.05493255332112312, "learning_rate": 0.00024111499999999997, "loss": 0.0066, "step": 138000 }, { "epoch": 1.3495144681641713, "eval_accuracy": 0.9993639047619047, "eval_loss": 0.00431590573862195, "eval_runtime": 53.7077, "eval_samples_per_second": 558.579, "eval_steps_per_second": 34.911, "step": 138000 }, { "epoch": 1.3544040133386792, "grad_norm": 0.04275180399417877, "learning_rate": 0.00023917055555555555, "loss": 0.0062, "step": 138500 }, { "epoch": 1.3592935585131871, "grad_norm": 0.07628139853477478, "learning_rate": 0.0002372261111111111, "loss": 0.0065, "step": 139000 }, { "epoch": 1.3592935585131871, "eval_accuracy": 0.9993583809523809, "eval_loss": 0.0042925444431602955, "eval_runtime": 53.3087, "eval_samples_per_second": 562.76, "eval_steps_per_second": 35.173, "step": 139000 }, { "epoch": 1.364183103687695, "grad_norm": 0.018862802535295486, "learning_rate": 0.00023528166666666667, "loss": 0.0064, "step": 139500 }, { "epoch": 1.3690726488622027, "grad_norm": 0.059994716197252274, "learning_rate": 0.00023333722222222222, "loss": 0.0061, "step": 140000 }, { "epoch": 1.3690726488622027, "eval_accuracy": 0.9993745714285714, "eval_loss": 0.004216773435473442, "eval_runtime": 53.7427, "eval_samples_per_second": 558.215, "eval_steps_per_second": 34.888, "step": 140000 }, { "epoch": 1.3739621940367108, "grad_norm": 0.02738560363650322, "learning_rate": 0.00023139277777777775, "loss": 0.006, "step": 140500 }, { "epoch": 1.3788517392112185, "grad_norm": 0.16879647970199585, "learning_rate": 0.0002294483333333333, "loss": 0.0062, "step": 141000 }, { "epoch": 1.3788517392112185, "eval_accuracy": 0.9993692380952381, "eval_loss": 0.004215199965983629, "eval_runtime": 53.2674, "eval_samples_per_second": 563.197, "eval_steps_per_second": 35.2, "step": 141000 }, { "epoch": 1.3837412843857264, "grad_norm": 0.03396091237664223, "learning_rate": 0.0002275038888888889, "loss": 0.0062, "step": 141500 }, { "epoch": 1.3886308295602343, "grad_norm": 0.04174041002988815, "learning_rate": 0.00022555944444444445, "loss": 0.0063, "step": 142000 }, { "epoch": 1.3886308295602343, "eval_accuracy": 0.9993620476190476, "eval_loss": 0.00427864259108901, "eval_runtime": 54.516, "eval_samples_per_second": 550.297, "eval_steps_per_second": 34.394, "step": 142000 }, { "epoch": 1.3935203747347422, "grad_norm": 0.032653287053108215, "learning_rate": 0.000223615, "loss": 0.0062, "step": 142500 }, { "epoch": 1.3984099199092501, "grad_norm": 0.04273010045289993, "learning_rate": 0.00022167055555555556, "loss": 0.0061, "step": 143000 }, { "epoch": 1.3984099199092501, "eval_accuracy": 0.9993804761904762, "eval_loss": 0.0041556586511433125, "eval_runtime": 53.4491, "eval_samples_per_second": 561.282, "eval_steps_per_second": 35.08, "step": 143000 }, { "epoch": 1.4032994650837578, "grad_norm": 0.043946944177150726, "learning_rate": 0.0002197261111111111, "loss": 0.0059, "step": 143500 }, { "epoch": 1.4081890102582657, "grad_norm": 0.016042672097682953, "learning_rate": 0.00021778166666666665, "loss": 0.0062, "step": 144000 }, { "epoch": 1.4081890102582657, "eval_accuracy": 0.9993822857142857, "eval_loss": 0.004146920517086983, "eval_runtime": 53.2095, "eval_samples_per_second": 563.809, "eval_steps_per_second": 35.238, "step": 144000 }, { "epoch": 1.4130785554327736, "grad_norm": 0.04190443456172943, "learning_rate": 0.0002158372222222222, "loss": 0.006, "step": 144500 }, { "epoch": 1.4179681006072815, "grad_norm": 0.029104501008987427, "learning_rate": 0.0002138927777777778, "loss": 0.006, "step": 145000 }, { "epoch": 1.4179681006072815, "eval_accuracy": 0.9993911428571428, "eval_loss": 0.004062490537762642, "eval_runtime": 53.4832, "eval_samples_per_second": 560.923, "eval_steps_per_second": 35.058, "step": 145000 }, { "epoch": 1.4228576457817894, "grad_norm": 0.019995709881186485, "learning_rate": 0.00021194833333333335, "loss": 0.0058, "step": 145500 }, { "epoch": 1.4277471909562973, "grad_norm": 0.016850166022777557, "learning_rate": 0.0002100038888888889, "loss": 0.0062, "step": 146000 }, { "epoch": 1.4277471909562973, "eval_accuracy": 0.9993850476190477, "eval_loss": 0.00406758114695549, "eval_runtime": 54.1149, "eval_samples_per_second": 554.376, "eval_steps_per_second": 34.648, "step": 146000 }, { "epoch": 1.4326367361308052, "grad_norm": 0.042491696774959564, "learning_rate": 0.00020805944444444443, "loss": 0.0059, "step": 146500 }, { "epoch": 1.437526281305313, "grad_norm": 0.07708732038736343, "learning_rate": 0.000206115, "loss": 0.006, "step": 147000 }, { "epoch": 1.437526281305313, "eval_accuracy": 0.9993972857142858, "eval_loss": 0.004030513111501932, "eval_runtime": 53.221, "eval_samples_per_second": 563.687, "eval_steps_per_second": 35.23, "step": 147000 }, { "epoch": 1.4424158264798208, "grad_norm": 0.032772552222013474, "learning_rate": 0.00020417055555555554, "loss": 0.0059, "step": 147500 }, { "epoch": 1.4473053716543287, "grad_norm": 0.041167329996824265, "learning_rate": 0.00020222611111111113, "loss": 0.0058, "step": 148000 }, { "epoch": 1.4473053716543287, "eval_accuracy": 0.999398, "eval_loss": 0.004109182395040989, "eval_runtime": 53.8747, "eval_samples_per_second": 556.848, "eval_steps_per_second": 34.803, "step": 148000 }, { "epoch": 1.4521949168288366, "grad_norm": 0.033146705478429794, "learning_rate": 0.00020028166666666668, "loss": 0.0058, "step": 148500 }, { "epoch": 1.4570844620033445, "grad_norm": 0.04614367336034775, "learning_rate": 0.0001983372222222222, "loss": 0.0057, "step": 149000 }, { "epoch": 1.4570844620033445, "eval_accuracy": 0.9994065238095238, "eval_loss": 0.003991841338574886, "eval_runtime": 53.7363, "eval_samples_per_second": 558.282, "eval_steps_per_second": 34.893, "step": 149000 }, { "epoch": 1.4619740071778522, "grad_norm": 0.031296566128730774, "learning_rate": 0.00019639277777777777, "loss": 0.0057, "step": 149500 }, { "epoch": 1.46686355235236, "grad_norm": 0.03523857146501541, "learning_rate": 0.00019444833333333333, "loss": 0.0059, "step": 150000 }, { "epoch": 1.46686355235236, "eval_accuracy": 0.9994045238095238, "eval_loss": 0.00398767227306962, "eval_runtime": 54.0668, "eval_samples_per_second": 554.869, "eval_steps_per_second": 34.679, "step": 150000 }, { "epoch": 1.471753097526868, "grad_norm": 0.030513431876897812, "learning_rate": 0.00019250388888888888, "loss": 0.006, "step": 150500 }, { "epoch": 1.476642642701376, "grad_norm": 0.03433874994516373, "learning_rate": 0.00019055944444444444, "loss": 0.0057, "step": 151000 }, { "epoch": 1.476642642701376, "eval_accuracy": 0.999412, "eval_loss": 0.003936768043786287, "eval_runtime": 53.4197, "eval_samples_per_second": 561.591, "eval_steps_per_second": 35.099, "step": 151000 }, { "epoch": 1.4815321878758838, "grad_norm": 0.03743559867143631, "learning_rate": 0.00018861500000000002, "loss": 0.0059, "step": 151500 }, { "epoch": 1.4864217330503917, "grad_norm": 0.023772869259119034, "learning_rate": 0.00018667055555555553, "loss": 0.0056, "step": 152000 }, { "epoch": 1.4864217330503917, "eval_accuracy": 0.9994103333333333, "eval_loss": 0.00395695585757494, "eval_runtime": 53.4862, "eval_samples_per_second": 560.892, "eval_steps_per_second": 35.056, "step": 152000 }, { "epoch": 1.4913112782248996, "grad_norm": 0.021286042407155037, "learning_rate": 0.0001847261111111111, "loss": 0.0056, "step": 152500 }, { "epoch": 1.4962008233994073, "grad_norm": 0.04487517103552818, "learning_rate": 0.00018278166666666667, "loss": 0.0059, "step": 153000 }, { "epoch": 1.4962008233994073, "eval_accuracy": 0.9994135714285715, "eval_loss": 0.0038883944507688284, "eval_runtime": 53.7959, "eval_samples_per_second": 557.663, "eval_steps_per_second": 34.854, "step": 153000 }, { "epoch": 1.5010903685739152, "grad_norm": 0.02229585126042366, "learning_rate": 0.00018083722222222222, "loss": 0.0056, "step": 153500 }, { "epoch": 1.505979913748423, "grad_norm": 0.06015641614794731, "learning_rate": 0.00017889277777777778, "loss": 0.0055, "step": 154000 }, { "epoch": 1.505979913748423, "eval_accuracy": 0.9994171428571429, "eval_loss": 0.0039031950291246176, "eval_runtime": 53.8206, "eval_samples_per_second": 557.408, "eval_steps_per_second": 34.838, "step": 154000 }, { "epoch": 1.510869458922931, "grad_norm": 0.060777414590120316, "learning_rate": 0.00017694833333333336, "loss": 0.0057, "step": 154500 }, { "epoch": 1.515759004097439, "grad_norm": 0.010729908011853695, "learning_rate": 0.00017500388888888886, "loss": 0.0055, "step": 155000 }, { "epoch": 1.515759004097439, "eval_accuracy": 0.9994168095238095, "eval_loss": 0.0038592983037233353, "eval_runtime": 52.9997, "eval_samples_per_second": 566.041, "eval_steps_per_second": 35.378, "step": 155000 }, { "epoch": 1.5206485492719466, "grad_norm": 0.07996519654989243, "learning_rate": 0.00017305944444444445, "loss": 0.0056, "step": 155500 }, { "epoch": 1.5255380944464547, "grad_norm": 0.05094398185610771, "learning_rate": 0.000171115, "loss": 0.0056, "step": 156000 }, { "epoch": 1.5255380944464547, "eval_accuracy": 0.9994315238095238, "eval_loss": 0.0037978454492986202, "eval_runtime": 53.5723, "eval_samples_per_second": 559.991, "eval_steps_per_second": 34.999, "step": 156000 }, { "epoch": 1.5304276396209624, "grad_norm": 0.038200926035642624, "learning_rate": 0.00016917055555555556, "loss": 0.0055, "step": 156500 }, { "epoch": 1.5353171847954703, "grad_norm": 0.10346455127000809, "learning_rate": 0.00016722611111111112, "loss": 0.0054, "step": 157000 }, { "epoch": 1.5353171847954703, "eval_accuracy": 0.9994299523809523, "eval_loss": 0.0037865168415009975, "eval_runtime": 53.2357, "eval_samples_per_second": 563.531, "eval_steps_per_second": 35.221, "step": 157000 }, { "epoch": 1.5402067299699782, "grad_norm": 0.015595887787640095, "learning_rate": 0.00016528166666666667, "loss": 0.0056, "step": 157500 }, { "epoch": 1.545096275144486, "grad_norm": 0.0232669860124588, "learning_rate": 0.00016333722222222223, "loss": 0.0055, "step": 158000 }, { "epoch": 1.545096275144486, "eval_accuracy": 0.9994310476190477, "eval_loss": 0.003748950082808733, "eval_runtime": 54.3134, "eval_samples_per_second": 552.35, "eval_steps_per_second": 34.522, "step": 158000 }, { "epoch": 1.549985820318994, "grad_norm": 0.04196183383464813, "learning_rate": 0.00016139277777777776, "loss": 0.0054, "step": 158500 }, { "epoch": 1.5548753654935017, "grad_norm": 0.04280064254999161, "learning_rate": 0.00015944833333333334, "loss": 0.0055, "step": 159000 }, { "epoch": 1.5548753654935017, "eval_accuracy": 0.9994327619047619, "eval_loss": 0.00377083383500576, "eval_runtime": 53.1652, "eval_samples_per_second": 564.278, "eval_steps_per_second": 35.267, "step": 159000 }, { "epoch": 1.5597649106680098, "grad_norm": 0.01646304689347744, "learning_rate": 0.00015750388888888887, "loss": 0.0053, "step": 159500 }, { "epoch": 1.5646544558425175, "grad_norm": 0.015490056946873665, "learning_rate": 0.00015555944444444443, "loss": 0.0053, "step": 160000 }, { "epoch": 1.5646544558425175, "eval_accuracy": 0.9994344285714286, "eval_loss": 0.0037254535127431154, "eval_runtime": 55.4441, "eval_samples_per_second": 541.086, "eval_steps_per_second": 33.818, "step": 160000 }, { "epoch": 1.5695440010170254, "grad_norm": 0.034573186188936234, "learning_rate": 0.000153615, "loss": 0.0052, "step": 160500 }, { "epoch": 1.5744335461915333, "grad_norm": 0.0471004843711853, "learning_rate": 0.00015167055555555554, "loss": 0.0055, "step": 161000 }, { "epoch": 1.5744335461915333, "eval_accuracy": 0.9994374285714286, "eval_loss": 0.003749826457351446, "eval_runtime": 52.9976, "eval_samples_per_second": 566.063, "eval_steps_per_second": 35.379, "step": 161000 }, { "epoch": 1.579323091366041, "grad_norm": 0.06533846259117126, "learning_rate": 0.0001497261111111111, "loss": 0.0056, "step": 161500 }, { "epoch": 1.5842126365405491, "grad_norm": 0.009449661709368229, "learning_rate": 0.00014778166666666668, "loss": 0.0053, "step": 162000 }, { "epoch": 1.5842126365405491, "eval_accuracy": 0.9994476666666666, "eval_loss": 0.003748701885342598, "eval_runtime": 53.6491, "eval_samples_per_second": 559.189, "eval_steps_per_second": 34.949, "step": 162000 }, { "epoch": 1.5891021817150568, "grad_norm": 0.009880056604743004, "learning_rate": 0.0001458372222222222, "loss": 0.0055, "step": 162500 }, { "epoch": 1.5939917268895647, "grad_norm": 0.05580669641494751, "learning_rate": 0.00014389277777777777, "loss": 0.0051, "step": 163000 }, { "epoch": 1.5939917268895647, "eval_accuracy": 0.9994498571428572, "eval_loss": 0.0037052214611321688, "eval_runtime": 53.1475, "eval_samples_per_second": 564.467, "eval_steps_per_second": 35.279, "step": 163000 }, { "epoch": 1.5988812720640726, "grad_norm": 0.033147793263196945, "learning_rate": 0.00014194833333333335, "loss": 0.0055, "step": 163500 }, { "epoch": 1.6037708172385805, "grad_norm": 0.04852864146232605, "learning_rate": 0.00014000388888888888, "loss": 0.0054, "step": 164000 }, { "epoch": 1.6037708172385805, "eval_accuracy": 0.9994494761904762, "eval_loss": 0.003642507828772068, "eval_runtime": 53.3294, "eval_samples_per_second": 562.542, "eval_steps_per_second": 35.159, "step": 164000 }, { "epoch": 1.6086603624130884, "grad_norm": 0.04461289569735527, "learning_rate": 0.00013805944444444444, "loss": 0.0053, "step": 164500 }, { "epoch": 1.613549907587596, "grad_norm": 0.04816494509577751, "learning_rate": 0.000136115, "loss": 0.0053, "step": 165000 }, { "epoch": 1.613549907587596, "eval_accuracy": 0.999451380952381, "eval_loss": 0.003629567800089717, "eval_runtime": 53.0304, "eval_samples_per_second": 565.713, "eval_steps_per_second": 35.357, "step": 165000 }, { "epoch": 1.6184394527621042, "grad_norm": 0.04067426174879074, "learning_rate": 0.00013417055555555555, "loss": 0.0063, "step": 165500 }, { "epoch": 1.623328997936612, "grad_norm": 0.040210772305727005, "learning_rate": 0.0001322261111111111, "loss": 0.0053, "step": 166000 }, { "epoch": 1.623328997936612, "eval_accuracy": 0.9994541904761904, "eval_loss": 0.0036138601135462523, "eval_runtime": 53.4406, "eval_samples_per_second": 561.371, "eval_steps_per_second": 35.086, "step": 166000 }, { "epoch": 1.6282185431111198, "grad_norm": 0.04125046357512474, "learning_rate": 0.00013028166666666666, "loss": 0.0053, "step": 166500 }, { "epoch": 1.6331080882856277, "grad_norm": 0.03415411710739136, "learning_rate": 0.00012833722222222222, "loss": 0.0051, "step": 167000 }, { "epoch": 1.6331080882856277, "eval_accuracy": 0.9994632380952381, "eval_loss": 0.003615338122472167, "eval_runtime": 53.1392, "eval_samples_per_second": 564.555, "eval_steps_per_second": 35.285, "step": 167000 }, { "epoch": 1.6379976334601354, "grad_norm": 0.03695495426654816, "learning_rate": 0.00012639277777777778, "loss": 0.0053, "step": 167500 }, { "epoch": 1.6428871786346435, "grad_norm": 0.011762870475649834, "learning_rate": 0.00012444833333333333, "loss": 0.0051, "step": 168000 }, { "epoch": 1.6428871786346435, "eval_accuracy": 0.9994638095238095, "eval_loss": 0.003587596118450165, "eval_runtime": 53.5347, "eval_samples_per_second": 560.384, "eval_steps_per_second": 35.024, "step": 168000 }, { "epoch": 1.6477767238091512, "grad_norm": 0.01232131477445364, "learning_rate": 0.0001225038888888889, "loss": 0.0048, "step": 168500 }, { "epoch": 1.652666268983659, "grad_norm": 0.04049614071846008, "learning_rate": 0.00012055944444444445, "loss": 0.0048, "step": 169000 }, { "epoch": 1.652666268983659, "eval_accuracy": 0.9994665714285714, "eval_loss": 0.003581820521503687, "eval_runtime": 53.0217, "eval_samples_per_second": 565.806, "eval_steps_per_second": 35.363, "step": 169000 }, { "epoch": 1.657555814158167, "grad_norm": 0.04034195467829704, "learning_rate": 0.00011861499999999999, "loss": 0.0051, "step": 169500 }, { "epoch": 1.662445359332675, "grad_norm": 0.014481657184660435, "learning_rate": 0.00011667055555555556, "loss": 0.0051, "step": 170000 }, { "epoch": 1.662445359332675, "eval_accuracy": 0.9994720476190476, "eval_loss": 0.0035638269037008286, "eval_runtime": 54.3974, "eval_samples_per_second": 551.497, "eval_steps_per_second": 34.469, "step": 170000 }, { "epoch": 1.6673349045071828, "grad_norm": 0.025204768404364586, "learning_rate": 0.00011472611111111111, "loss": 0.0051, "step": 170500 }, { "epoch": 1.6722244496816905, "grad_norm": 0.027605898678302765, "learning_rate": 0.00011278166666666666, "loss": 0.0049, "step": 171000 }, { "epoch": 1.6722244496816905, "eval_accuracy": 0.9994744285714285, "eval_loss": 0.003567066974937916, "eval_runtime": 53.8985, "eval_samples_per_second": 556.602, "eval_steps_per_second": 34.788, "step": 171000 }, { "epoch": 1.6771139948561986, "grad_norm": 0.038017790764570236, "learning_rate": 0.00011083722222222223, "loss": 0.005, "step": 171500 }, { "epoch": 1.6820035400307063, "grad_norm": 0.048752035945653915, "learning_rate": 0.00010889277777777778, "loss": 0.005, "step": 172000 }, { "epoch": 1.6820035400307063, "eval_accuracy": 0.9994751428571429, "eval_loss": 0.003484962275251746, "eval_runtime": 54.461, "eval_samples_per_second": 550.853, "eval_steps_per_second": 34.428, "step": 172000 }, { "epoch": 1.6868930852052142, "grad_norm": 0.08453824371099472, "learning_rate": 0.00010694833333333333, "loss": 0.005, "step": 172500 }, { "epoch": 1.691782630379722, "grad_norm": 0.01620589755475521, "learning_rate": 0.00010500388888888888, "loss": 0.005, "step": 173000 }, { "epoch": 1.691782630379722, "eval_accuracy": 0.9994759047619047, "eval_loss": 0.003478883532807231, "eval_runtime": 54.0084, "eval_samples_per_second": 555.469, "eval_steps_per_second": 34.717, "step": 173000 }, { "epoch": 1.69667217555423, "grad_norm": 0.024735888466238976, "learning_rate": 0.00010305944444444445, "loss": 0.005, "step": 173500 }, { "epoch": 1.701561720728738, "grad_norm": 0.020829100161790848, "learning_rate": 0.000101115, "loss": 0.005, "step": 174000 }, { "epoch": 1.701561720728738, "eval_accuracy": 0.9994835714285715, "eval_loss": 0.003460401203483343, "eval_runtime": 53.8886, "eval_samples_per_second": 556.704, "eval_steps_per_second": 34.794, "step": 174000 }, { "epoch": 1.7064512659032456, "grad_norm": 0.02870938368141651, "learning_rate": 9.917055555555555e-05, "loss": 0.0049, "step": 174500 }, { "epoch": 1.7113408110777537, "grad_norm": 0.03082539327442646, "learning_rate": 9.72261111111111e-05, "loss": 0.0049, "step": 175000 }, { "epoch": 1.7113408110777537, "eval_accuracy": 0.9994848095238095, "eval_loss": 0.0034341050777584314, "eval_runtime": 53.6965, "eval_samples_per_second": 558.695, "eval_steps_per_second": 34.918, "step": 175000 }, { "epoch": 1.7162303562522614, "grad_norm": 0.04300360381603241, "learning_rate": 9.528166666666667e-05, "loss": 0.0047, "step": 175500 }, { "epoch": 1.7211199014267693, "grad_norm": 0.010836569592356682, "learning_rate": 9.333722222222222e-05, "loss": 0.0049, "step": 176000 }, { "epoch": 1.7211199014267693, "eval_accuracy": 0.999487, "eval_loss": 0.0034149654675275087, "eval_runtime": 54.2439, "eval_samples_per_second": 553.058, "eval_steps_per_second": 34.566, "step": 176000 }, { "epoch": 1.7260094466012772, "grad_norm": 0.012880703434348106, "learning_rate": 9.139277777777777e-05, "loss": 0.0049, "step": 176500 }, { "epoch": 1.7308989917757849, "grad_norm": 0.029965711757540703, "learning_rate": 8.944833333333334e-05, "loss": 0.0049, "step": 177000 }, { "epoch": 1.7308989917757849, "eval_accuracy": 0.9994862857142857, "eval_loss": 0.0034615020267665386, "eval_runtime": 53.1772, "eval_samples_per_second": 564.151, "eval_steps_per_second": 35.259, "step": 177000 }, { "epoch": 1.735788536950293, "grad_norm": 0.014986414462327957, "learning_rate": 8.750388888888889e-05, "loss": 0.0048, "step": 177500 }, { "epoch": 1.7406780821248007, "grad_norm": 0.02675153873860836, "learning_rate": 8.555944444444445e-05, "loss": 0.0049, "step": 178000 }, { "epoch": 1.7406780821248007, "eval_accuracy": 0.9994909047619047, "eval_loss": 0.003412367310374975, "eval_runtime": 54.1342, "eval_samples_per_second": 554.178, "eval_steps_per_second": 34.636, "step": 178000 }, { "epoch": 1.7455676272993086, "grad_norm": 0.031100204214453697, "learning_rate": 8.3615e-05, "loss": 0.0051, "step": 178500 }, { "epoch": 1.7504571724738165, "grad_norm": 0.04925690218806267, "learning_rate": 8.167055555555555e-05, "loss": 0.005, "step": 179000 }, { "epoch": 1.7504571724738165, "eval_accuracy": 0.9994981428571429, "eval_loss": 0.003331870539113879, "eval_runtime": 53.5829, "eval_samples_per_second": 559.881, "eval_steps_per_second": 34.993, "step": 179000 }, { "epoch": 1.7553467176483244, "grad_norm": 0.029799846932291985, "learning_rate": 7.972611111111112e-05, "loss": 0.0048, "step": 179500 }, { "epoch": 1.7602362628228323, "grad_norm": 0.012169072404503822, "learning_rate": 7.778166666666666e-05, "loss": 0.005, "step": 180000 }, { "epoch": 1.7602362628228323, "eval_accuracy": 0.9994982380952381, "eval_loss": 0.003362874034792185, "eval_runtime": 53.5982, "eval_samples_per_second": 559.72, "eval_steps_per_second": 34.983, "step": 180000 }, { "epoch": 1.76512580799734, "grad_norm": 0.016585633158683777, "learning_rate": 7.583722222222222e-05, "loss": 0.0045, "step": 180500 }, { "epoch": 1.770015353171848, "grad_norm": 0.025369074195623398, "learning_rate": 7.389277777777777e-05, "loss": 0.0047, "step": 181000 }, { "epoch": 1.770015353171848, "eval_accuracy": 0.9995001904761904, "eval_loss": 0.0033530080690979958, "eval_runtime": 53.4193, "eval_samples_per_second": 561.595, "eval_steps_per_second": 35.1, "step": 181000 }, { "epoch": 1.7749048983463558, "grad_norm": 0.04421771690249443, "learning_rate": 7.194833333333333e-05, "loss": 0.0046, "step": 181500 }, { "epoch": 1.7797944435208637, "grad_norm": 0.05346609279513359, "learning_rate": 7.000388888888889e-05, "loss": 0.0048, "step": 182000 }, { "epoch": 1.7797944435208637, "eval_accuracy": 0.9995021428571429, "eval_loss": 0.0033386677969247103, "eval_runtime": 53.8476, "eval_samples_per_second": 557.128, "eval_steps_per_second": 34.82, "step": 182000 }, { "epoch": 1.7846839886953716, "grad_norm": 0.019687172025442123, "learning_rate": 6.805944444444444e-05, "loss": 0.0048, "step": 182500 }, { "epoch": 1.7895735338698793, "grad_norm": 0.026194104924798012, "learning_rate": 6.6115e-05, "loss": 0.0048, "step": 183000 }, { "epoch": 1.7895735338698793, "eval_accuracy": 0.9995039523809524, "eval_loss": 0.003320470917969942, "eval_runtime": 54.9547, "eval_samples_per_second": 545.904, "eval_steps_per_second": 34.119, "step": 183000 }, { "epoch": 1.7944630790443874, "grad_norm": 0.039239440113306046, "learning_rate": 6.417055555555556e-05, "loss": 0.0046, "step": 183500 }, { "epoch": 1.799352624218895, "grad_norm": 0.007467139046639204, "learning_rate": 6.222611111111111e-05, "loss": 0.0045, "step": 184000 }, { "epoch": 1.799352624218895, "eval_accuracy": 0.9995039047619048, "eval_loss": 0.003313555382192135, "eval_runtime": 54.3263, "eval_samples_per_second": 552.218, "eval_steps_per_second": 34.514, "step": 184000 }, { "epoch": 1.804242169393403, "grad_norm": 0.015036596916615963, "learning_rate": 6.028166666666666e-05, "loss": 0.0047, "step": 184500 }, { "epoch": 1.8091317145679109, "grad_norm": 0.03583378717303276, "learning_rate": 5.8337222222222226e-05, "loss": 0.0045, "step": 185000 }, { "epoch": 1.8091317145679109, "eval_accuracy": 0.9995053333333334, "eval_loss": 0.003319466719403863, "eval_runtime": 53.7318, "eval_samples_per_second": 558.329, "eval_steps_per_second": 34.896, "step": 185000 }, { "epoch": 1.8140212597424188, "grad_norm": 0.025585120543837547, "learning_rate": 5.6392777777777775e-05, "loss": 0.0046, "step": 185500 }, { "epoch": 1.8189108049169267, "grad_norm": 0.05633428320288658, "learning_rate": 5.444833333333333e-05, "loss": 0.0049, "step": 186000 }, { "epoch": 1.8189108049169267, "eval_accuracy": 0.9995094285714285, "eval_loss": 0.0032863873057067394, "eval_runtime": 54.2808, "eval_samples_per_second": 552.682, "eval_steps_per_second": 34.543, "step": 186000 }, { "epoch": 1.8238003500914344, "grad_norm": 0.08839651942253113, "learning_rate": 5.2503888888888895e-05, "loss": 0.0046, "step": 186500 }, { "epoch": 1.8286898952659425, "grad_norm": 0.02346086874604225, "learning_rate": 5.0559444444444445e-05, "loss": 0.0046, "step": 187000 }, { "epoch": 1.8286898952659425, "eval_accuracy": 0.999513, "eval_loss": 0.0032574611250311136, "eval_runtime": 53.1956, "eval_samples_per_second": 563.957, "eval_steps_per_second": 35.247, "step": 187000 }, { "epoch": 1.8335794404404502, "grad_norm": 0.04460394009947777, "learning_rate": 4.8615e-05, "loss": 0.0048, "step": 187500 }, { "epoch": 1.838468985614958, "grad_norm": 0.039988644421100616, "learning_rate": 4.667055555555555e-05, "loss": 0.0045, "step": 188000 }, { "epoch": 1.838468985614958, "eval_accuracy": 0.999518380952381, "eval_loss": 0.0032375219743698835, "eval_runtime": 54.4684, "eval_samples_per_second": 550.778, "eval_steps_per_second": 34.424, "step": 188000 }, { "epoch": 1.843358530789466, "grad_norm": 0.026043614372611046, "learning_rate": 4.4726111111111114e-05, "loss": 0.0045, "step": 188500 }, { "epoch": 1.8482480759639737, "grad_norm": 0.03250015527009964, "learning_rate": 4.2781666666666664e-05, "loss": 0.0046, "step": 189000 }, { "epoch": 1.8482480759639737, "eval_accuracy": 0.999518, "eval_loss": 0.003229686524719, "eval_runtime": 53.1577, "eval_samples_per_second": 564.358, "eval_steps_per_second": 35.272, "step": 189000 }, { "epoch": 1.8531376211384818, "grad_norm": 0.041333604604005814, "learning_rate": 4.083722222222222e-05, "loss": 0.0045, "step": 189500 }, { "epoch": 1.8580271663129895, "grad_norm": 0.030839432030916214, "learning_rate": 3.889277777777778e-05, "loss": 0.0044, "step": 190000 }, { "epoch": 1.8580271663129895, "eval_accuracy": 0.9995217142857142, "eval_loss": 0.0032240275759249926, "eval_runtime": 53.3872, "eval_samples_per_second": 561.933, "eval_steps_per_second": 35.121, "step": 190000 }, { "epoch": 1.8629167114874974, "grad_norm": 0.0212627574801445, "learning_rate": 3.694833333333333e-05, "loss": 0.0044, "step": 190500 }, { "epoch": 1.8678062566620053, "grad_norm": 0.04159221053123474, "learning_rate": 3.500388888888889e-05, "loss": 0.0046, "step": 191000 }, { "epoch": 1.8678062566620053, "eval_accuracy": 0.9995220476190476, "eval_loss": 0.0032232191879302263, "eval_runtime": 53.7977, "eval_samples_per_second": 557.645, "eval_steps_per_second": 34.853, "step": 191000 }, { "epoch": 1.8726958018365132, "grad_norm": 0.02389533445239067, "learning_rate": 3.3059444444444446e-05, "loss": 0.0045, "step": 191500 }, { "epoch": 1.877585347011021, "grad_norm": 0.02341424487531185, "learning_rate": 3.1115e-05, "loss": 0.0045, "step": 192000 }, { "epoch": 1.877585347011021, "eval_accuracy": 0.9995232857142857, "eval_loss": 0.0032045834232121706, "eval_runtime": 54.1632, "eval_samples_per_second": 553.881, "eval_steps_per_second": 34.618, "step": 192000 }, { "epoch": 1.8824748921855288, "grad_norm": 0.03770390897989273, "learning_rate": 2.9170555555555556e-05, "loss": 0.0046, "step": 192500 }, { "epoch": 1.887364437360037, "grad_norm": 0.024086985737085342, "learning_rate": 2.7226111111111112e-05, "loss": 0.0044, "step": 193000 }, { "epoch": 1.887364437360037, "eval_accuracy": 0.9995234285714286, "eval_loss": 0.003206311957910657, "eval_runtime": 53.6844, "eval_samples_per_second": 558.822, "eval_steps_per_second": 34.926, "step": 193000 }, { "epoch": 1.8922539825345446, "grad_norm": 0.02860225737094879, "learning_rate": 2.5281666666666665e-05, "loss": 0.0043, "step": 193500 }, { "epoch": 1.8971435277090525, "grad_norm": 0.034325193613767624, "learning_rate": 2.3337222222222222e-05, "loss": 0.0044, "step": 194000 }, { "epoch": 1.8971435277090525, "eval_accuracy": 0.9995254285714286, "eval_loss": 0.0031913991551846266, "eval_runtime": 54.2224, "eval_samples_per_second": 553.276, "eval_steps_per_second": 34.58, "step": 194000 }, { "epoch": 1.9020330728835604, "grad_norm": 0.03300917148590088, "learning_rate": 2.139277777777778e-05, "loss": 0.0045, "step": 194500 }, { "epoch": 1.9069226180580683, "grad_norm": 0.037190355360507965, "learning_rate": 1.9448333333333335e-05, "loss": 0.0043, "step": 195000 }, { "epoch": 1.9069226180580683, "eval_accuracy": 0.9995253333333334, "eval_loss": 0.0031883243937045336, "eval_runtime": 54.2098, "eval_samples_per_second": 553.405, "eval_steps_per_second": 34.588, "step": 195000 }, { "epoch": 1.9118121632325762, "grad_norm": 0.1029098629951477, "learning_rate": 1.7503888888888888e-05, "loss": 0.0045, "step": 195500 }, { "epoch": 1.9167017084070839, "grad_norm": 0.027764180675148964, "learning_rate": 1.5559444444444444e-05, "loss": 0.0043, "step": 196000 }, { "epoch": 1.9167017084070839, "eval_accuracy": 0.9995274761904762, "eval_loss": 0.0031636343337595463, "eval_runtime": 54.5719, "eval_samples_per_second": 549.733, "eval_steps_per_second": 34.358, "step": 196000 }, { "epoch": 1.921591253581592, "grad_norm": 0.031358424574136734, "learning_rate": 1.3615e-05, "loss": 0.0043, "step": 196500 }, { "epoch": 1.9264807987560997, "grad_norm": 0.035557158291339874, "learning_rate": 1.1670555555555556e-05, "loss": 0.0046, "step": 197000 }, { "epoch": 1.9264807987560997, "eval_accuracy": 0.999529761904762, "eval_loss": 0.0031551867723464966, "eval_runtime": 56.5808, "eval_samples_per_second": 530.215, "eval_steps_per_second": 33.138, "step": 197000 }, { "epoch": 1.9313703439306076, "grad_norm": 0.034682463854551315, "learning_rate": 9.72611111111111e-06, "loss": 0.0045, "step": 197500 }, { "epoch": 1.9362598891051155, "grad_norm": 0.023823970928788185, "learning_rate": 7.781666666666667e-06, "loss": 0.0043, "step": 198000 }, { "epoch": 1.9362598891051155, "eval_accuracy": 0.9995307142857143, "eval_loss": 0.0031563735101372004, "eval_runtime": 56.8893, "eval_samples_per_second": 527.34, "eval_steps_per_second": 32.959, "step": 198000 }, { "epoch": 1.9411494342796232, "grad_norm": 0.020929349586367607, "learning_rate": 5.837222222222222e-06, "loss": 0.0044, "step": 198500 }, { "epoch": 1.9460389794541313, "grad_norm": 0.025913028046488762, "learning_rate": 3.892777777777778e-06, "loss": 0.0044, "step": 199000 }, { "epoch": 1.9460389794541313, "eval_accuracy": 0.9995307619047619, "eval_loss": 0.003153804922476411, "eval_runtime": 54.8658, "eval_samples_per_second": 546.788, "eval_steps_per_second": 34.174, "step": 199000 }, { "epoch": 1.950928524628639, "grad_norm": 0.02582838013768196, "learning_rate": 1.9483333333333335e-06, "loss": 0.0044, "step": 199500 }, { "epoch": 1.9558180698031469, "grad_norm": 0.016758419573307037, "learning_rate": 3.888888888888889e-09, "loss": 0.0042, "step": 200000 }, { "epoch": 1.9558180698031469, "eval_accuracy": 0.9995309047619048, "eval_loss": 0.003156075021252036, "eval_runtime": 55.5903, "eval_samples_per_second": 539.663, "eval_steps_per_second": 33.729, "step": 200000 } ], "logging_steps": 500, "max_steps": 200000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 7, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.480299103223808e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }