| { | |
| "best_global_step": 199000, | |
| "best_metric": 0.003153804922476411, | |
| "best_model_checkpoint": "./models/t5-small-separated-augmented-200k\\checkpoint-199000", | |
| "epoch": 1.9558180698031469, | |
| "eval_steps": 1000, | |
| "global_step": 200000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004889545174507868, | |
| "grad_norm": 0.5060574412345886, | |
| "learning_rate": 1.7465e-05, | |
| "loss": 6.6771, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.009779090349015735, | |
| "grad_norm": 0.35827475786209106, | |
| "learning_rate": 3.4965e-05, | |
| "loss": 0.1893, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.009779090349015735, | |
| "eval_accuracy": 0.9878454285714285, | |
| "eval_loss": 0.09678807854652405, | |
| "eval_runtime": 54.2373, | |
| "eval_samples_per_second": 553.125, | |
| "eval_steps_per_second": 34.57, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.014668635523523602, | |
| "grad_norm": 0.34889769554138184, | |
| "learning_rate": 5.2465e-05, | |
| "loss": 0.1447, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.01955818069803147, | |
| "grad_norm": 0.23819516599178314, | |
| "learning_rate": 6.9965e-05, | |
| "loss": 0.1206, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.01955818069803147, | |
| "eval_accuracy": 0.9902656666666667, | |
| "eval_loss": 0.073659747838974, | |
| "eval_runtime": 53.4879, | |
| "eval_samples_per_second": 560.875, | |
| "eval_steps_per_second": 35.055, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.024447725872539336, | |
| "grad_norm": 0.22346411645412445, | |
| "learning_rate": 8.7465e-05, | |
| "loss": 0.1088, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.029337271047047205, | |
| "grad_norm": 0.1609542965888977, | |
| "learning_rate": 0.000104965, | |
| "loss": 0.1005, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.029337271047047205, | |
| "eval_accuracy": 0.9918363809523809, | |
| "eval_loss": 0.05878164619207382, | |
| "eval_runtime": 55.0708, | |
| "eval_samples_per_second": 544.753, | |
| "eval_steps_per_second": 34.047, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03422681622155507, | |
| "grad_norm": 0.16877996921539307, | |
| "learning_rate": 0.000122465, | |
| "loss": 0.0921, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.03911636139606294, | |
| "grad_norm": 0.19500480592250824, | |
| "learning_rate": 0.00013996499999999998, | |
| "loss": 0.0857, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.03911636139606294, | |
| "eval_accuracy": 0.992877380952381, | |
| "eval_loss": 0.05044129863381386, | |
| "eval_runtime": 52.8632, | |
| "eval_samples_per_second": 567.503, | |
| "eval_steps_per_second": 35.469, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.0440059065705708, | |
| "grad_norm": 0.2271735668182373, | |
| "learning_rate": 0.000157465, | |
| "loss": 0.0785, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.04889545174507867, | |
| "grad_norm": 0.15599773824214935, | |
| "learning_rate": 0.000174965, | |
| "loss": 0.0743, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.04889545174507867, | |
| "eval_accuracy": 0.993641380952381, | |
| "eval_loss": 0.04416767507791519, | |
| "eval_runtime": 53.9114, | |
| "eval_samples_per_second": 556.469, | |
| "eval_steps_per_second": 34.779, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.05378499691958654, | |
| "grad_norm": 0.13649936020374298, | |
| "learning_rate": 0.000192465, | |
| "loss": 0.0696, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.05867454209409441, | |
| "grad_norm": 0.17215129733085632, | |
| "learning_rate": 0.000209965, | |
| "loss": 0.0669, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.05867454209409441, | |
| "eval_accuracy": 0.994305380952381, | |
| "eval_loss": 0.03922554850578308, | |
| "eval_runtime": 53.2344, | |
| "eval_samples_per_second": 563.546, | |
| "eval_steps_per_second": 35.222, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.06356408726860227, | |
| "grad_norm": 0.21248804032802582, | |
| "learning_rate": 0.00022746500000000002, | |
| "loss": 0.0636, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.06845363244311015, | |
| "grad_norm": 0.2209671139717102, | |
| "learning_rate": 0.000244965, | |
| "loss": 0.062, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.06845363244311015, | |
| "eval_accuracy": 0.9948234285714286, | |
| "eval_loss": 0.035148605704307556, | |
| "eval_runtime": 53.6983, | |
| "eval_samples_per_second": 558.677, | |
| "eval_steps_per_second": 34.917, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.07334317761761801, | |
| "grad_norm": 0.16804896295070648, | |
| "learning_rate": 0.000262465, | |
| "loss": 0.0584, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.07823272279212588, | |
| "grad_norm": 0.13331238925457, | |
| "learning_rate": 0.000279965, | |
| "loss": 0.0576, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.07823272279212588, | |
| "eval_accuracy": 0.995110619047619, | |
| "eval_loss": 0.03278239071369171, | |
| "eval_runtime": 52.8459, | |
| "eval_samples_per_second": 567.688, | |
| "eval_steps_per_second": 35.481, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.08312226796663374, | |
| "grad_norm": 0.15275965631008148, | |
| "learning_rate": 0.000297465, | |
| "loss": 0.0545, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.0880118131411416, | |
| "grad_norm": 0.14770014584064484, | |
| "learning_rate": 0.000314965, | |
| "loss": 0.0509, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.0880118131411416, | |
| "eval_accuracy": 0.9955557619047619, | |
| "eval_loss": 0.029773302376270294, | |
| "eval_runtime": 53.681, | |
| "eval_samples_per_second": 558.857, | |
| "eval_steps_per_second": 34.929, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.09290135831564948, | |
| "grad_norm": 0.13802163302898407, | |
| "learning_rate": 0.000332465, | |
| "loss": 0.0503, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.09779090349015734, | |
| "grad_norm": 0.16345028579235077, | |
| "learning_rate": 0.000349965, | |
| "loss": 0.0492, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.09779090349015734, | |
| "eval_accuracy": 0.9958419047619048, | |
| "eval_loss": 0.027848461642861366, | |
| "eval_runtime": 53.5151, | |
| "eval_samples_per_second": 560.589, | |
| "eval_steps_per_second": 35.037, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.10268044866466522, | |
| "grad_norm": 0.09112809598445892, | |
| "learning_rate": 0.00036746500000000003, | |
| "loss": 0.0475, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.10756999383917308, | |
| "grad_norm": 0.20798154175281525, | |
| "learning_rate": 0.000384965, | |
| "loss": 0.046, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.10756999383917308, | |
| "eval_accuracy": 0.99604, | |
| "eval_loss": 0.026027251034975052, | |
| "eval_runtime": 53.2138, | |
| "eval_samples_per_second": 563.764, | |
| "eval_steps_per_second": 35.235, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.11245953901368094, | |
| "grad_norm": 0.19015829265117645, | |
| "learning_rate": 0.00040246499999999996, | |
| "loss": 0.0432, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.11734908418818882, | |
| "grad_norm": 0.12272685021162033, | |
| "learning_rate": 0.000419965, | |
| "loss": 0.0434, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.11734908418818882, | |
| "eval_accuracy": 0.9963316666666666, | |
| "eval_loss": 0.024126138538122177, | |
| "eval_runtime": 53.0184, | |
| "eval_samples_per_second": 565.841, | |
| "eval_steps_per_second": 35.365, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.12223862936269668, | |
| "grad_norm": 0.10090415924787521, | |
| "learning_rate": 0.000437465, | |
| "loss": 0.0425, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.12712817453720454, | |
| "grad_norm": 0.113510861992836, | |
| "learning_rate": 0.000454965, | |
| "loss": 0.0412, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.12712817453720454, | |
| "eval_accuracy": 0.9965295714285715, | |
| "eval_loss": 0.022982601076364517, | |
| "eval_runtime": 54.4384, | |
| "eval_samples_per_second": 551.082, | |
| "eval_steps_per_second": 34.443, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.13201771971171242, | |
| "grad_norm": 0.09937796741724014, | |
| "learning_rate": 0.00047246500000000004, | |
| "loss": 0.04, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.1369072648862203, | |
| "grad_norm": 0.11914831399917603, | |
| "learning_rate": 0.000489965, | |
| "loss": 0.0389, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.1369072648862203, | |
| "eval_accuracy": 0.996802, | |
| "eval_loss": 0.02113029547035694, | |
| "eval_runtime": 53.2003, | |
| "eval_samples_per_second": 563.907, | |
| "eval_steps_per_second": 35.244, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.14179681006072814, | |
| "grad_norm": 0.17324307560920715, | |
| "learning_rate": 0.000507465, | |
| "loss": 0.0384, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.14668635523523602, | |
| "grad_norm": 0.12623025476932526, | |
| "learning_rate": 0.000524965, | |
| "loss": 0.0364, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.14668635523523602, | |
| "eval_accuracy": 0.9968850476190476, | |
| "eval_loss": 0.020160900428891182, | |
| "eval_runtime": 53.6937, | |
| "eval_samples_per_second": 558.725, | |
| "eval_steps_per_second": 34.92, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.1515759004097439, | |
| "grad_norm": 0.1337081342935562, | |
| "learning_rate": 0.000542465, | |
| "loss": 0.0367, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.15646544558425177, | |
| "grad_norm": 0.16239804029464722, | |
| "learning_rate": 0.000559965, | |
| "loss": 0.0357, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.15646544558425177, | |
| "eval_accuracy": 0.9969695714285715, | |
| "eval_loss": 0.020250126719474792, | |
| "eval_runtime": 54.2376, | |
| "eval_samples_per_second": 553.122, | |
| "eval_steps_per_second": 34.57, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.16135499075875961, | |
| "grad_norm": 0.09299212694168091, | |
| "learning_rate": 0.000577465, | |
| "loss": 0.0356, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.1662445359332675, | |
| "grad_norm": 0.12462040781974792, | |
| "learning_rate": 0.000594965, | |
| "loss": 0.0343, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.1662445359332675, | |
| "eval_accuracy": 0.9971193333333334, | |
| "eval_loss": 0.01877717673778534, | |
| "eval_runtime": 54.3351, | |
| "eval_samples_per_second": 552.13, | |
| "eval_steps_per_second": 34.508, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.17113408110777537, | |
| "grad_norm": 0.08858466893434525, | |
| "learning_rate": 0.000612465, | |
| "loss": 0.0337, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.1760236262822832, | |
| "grad_norm": 0.14879809319972992, | |
| "learning_rate": 0.000629965, | |
| "loss": 0.0335, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.1760236262822832, | |
| "eval_accuracy": 0.9971792380952381, | |
| "eval_loss": 0.018667874857783318, | |
| "eval_runtime": 54.7854, | |
| "eval_samples_per_second": 547.591, | |
| "eval_steps_per_second": 34.224, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.1809131714567911, | |
| "grad_norm": 0.10354409366846085, | |
| "learning_rate": 0.0006474650000000001, | |
| "loss": 0.032, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.18580271663129896, | |
| "grad_norm": 0.1182965636253357, | |
| "learning_rate": 0.000664965, | |
| "loss": 0.0318, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.18580271663129896, | |
| "eval_accuracy": 0.9973930952380953, | |
| "eval_loss": 0.017232514917850494, | |
| "eval_runtime": 53.7973, | |
| "eval_samples_per_second": 557.649, | |
| "eval_steps_per_second": 34.853, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.1906922618058068, | |
| "grad_norm": 0.05959112569689751, | |
| "learning_rate": 0.0006824649999999999, | |
| "loss": 0.0318, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.1955818069803147, | |
| "grad_norm": 0.1270582675933838, | |
| "learning_rate": 0.000699965, | |
| "loss": 0.0307, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.1955818069803147, | |
| "eval_accuracy": 0.9973767619047619, | |
| "eval_loss": 0.01737845316529274, | |
| "eval_runtime": 53.2789, | |
| "eval_samples_per_second": 563.075, | |
| "eval_steps_per_second": 35.192, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.20047135215482256, | |
| "grad_norm": 0.08427739888429642, | |
| "learning_rate": 0.0006980594444444445, | |
| "loss": 0.0298, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.20536089732933044, | |
| "grad_norm": 0.07171203941106796, | |
| "learning_rate": 0.000696115, | |
| "loss": 0.0293, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.20536089732933044, | |
| "eval_accuracy": 0.9975350952380952, | |
| "eval_loss": 0.016114523634314537, | |
| "eval_runtime": 53.7368, | |
| "eval_samples_per_second": 558.277, | |
| "eval_steps_per_second": 34.892, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.2102504425038383, | |
| "grad_norm": 0.07719539105892181, | |
| "learning_rate": 0.0006941705555555555, | |
| "loss": 0.0291, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.21513998767834616, | |
| "grad_norm": 0.08832105249166489, | |
| "learning_rate": 0.0006922261111111111, | |
| "loss": 0.0286, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.21513998767834616, | |
| "eval_accuracy": 0.9976384285714286, | |
| "eval_loss": 0.015542366541922092, | |
| "eval_runtime": 53.6783, | |
| "eval_samples_per_second": 558.885, | |
| "eval_steps_per_second": 34.93, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.22002953285285404, | |
| "grad_norm": 0.1472863107919693, | |
| "learning_rate": 0.0006902816666666667, | |
| "loss": 0.0277, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.22491907802736189, | |
| "grad_norm": 0.09753895550966263, | |
| "learning_rate": 0.0006883372222222222, | |
| "loss": 0.0268, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.22491907802736189, | |
| "eval_accuracy": 0.9977074761904762, | |
| "eval_loss": 0.015192433260381222, | |
| "eval_runtime": 53.436, | |
| "eval_samples_per_second": 561.419, | |
| "eval_steps_per_second": 35.089, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.22980862320186976, | |
| "grad_norm": 0.12348861992359161, | |
| "learning_rate": 0.0006863927777777778, | |
| "loss": 0.026, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.23469816837637764, | |
| "grad_norm": 0.1123756393790245, | |
| "learning_rate": 0.0006844483333333333, | |
| "loss": 0.0257, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.23469816837637764, | |
| "eval_accuracy": 0.997726761904762, | |
| "eval_loss": 0.014932113699615002, | |
| "eval_runtime": 53.1941, | |
| "eval_samples_per_second": 563.972, | |
| "eval_steps_per_second": 35.248, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.23958771355088548, | |
| "grad_norm": 0.07256095856428146, | |
| "learning_rate": 0.0006825038888888889, | |
| "loss": 0.0256, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.24447725872539336, | |
| "grad_norm": 0.05496814846992493, | |
| "learning_rate": 0.0006805594444444444, | |
| "loss": 0.0251, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.24447725872539336, | |
| "eval_accuracy": 0.9978721904761905, | |
| "eval_loss": 0.01384472381323576, | |
| "eval_runtime": 54.0604, | |
| "eval_samples_per_second": 554.935, | |
| "eval_steps_per_second": 34.683, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.24936680389990123, | |
| "grad_norm": 0.09915214031934738, | |
| "learning_rate": 0.000678615, | |
| "loss": 0.0251, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.2542563490744091, | |
| "grad_norm": 0.14060749113559723, | |
| "learning_rate": 0.0006766705555555555, | |
| "loss": 0.0244, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.2542563490744091, | |
| "eval_accuracy": 0.9979192857142857, | |
| "eval_loss": 0.01368007156997919, | |
| "eval_runtime": 52.8524, | |
| "eval_samples_per_second": 567.618, | |
| "eval_steps_per_second": 35.476, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.259145894248917, | |
| "grad_norm": 0.09252548217773438, | |
| "learning_rate": 0.0006747261111111111, | |
| "loss": 0.024, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.26403543942342483, | |
| "grad_norm": 0.11915791034698486, | |
| "learning_rate": 0.0006727816666666666, | |
| "loss": 0.0232, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.26403543942342483, | |
| "eval_accuracy": 0.9980117142857143, | |
| "eval_loss": 0.012998638674616814, | |
| "eval_runtime": 54.0246, | |
| "eval_samples_per_second": 555.303, | |
| "eval_steps_per_second": 34.706, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.2689249845979327, | |
| "grad_norm": 0.10810112953186035, | |
| "learning_rate": 0.0006708372222222222, | |
| "loss": 0.0233, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.2738145297724406, | |
| "grad_norm": 0.07593973726034164, | |
| "learning_rate": 0.0006688927777777778, | |
| "loss": 0.0227, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.2738145297724406, | |
| "eval_accuracy": 0.9980548095238095, | |
| "eval_loss": 0.012805027887225151, | |
| "eval_runtime": 53.176, | |
| "eval_samples_per_second": 564.164, | |
| "eval_steps_per_second": 35.26, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.27870407494694843, | |
| "grad_norm": 0.06336738914251328, | |
| "learning_rate": 0.0006669483333333333, | |
| "loss": 0.0229, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.2835936201214563, | |
| "grad_norm": 0.12944093346595764, | |
| "learning_rate": 0.0006650038888888889, | |
| "loss": 0.0221, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.2835936201214563, | |
| "eval_accuracy": 0.9980741428571429, | |
| "eval_loss": 0.012613357976078987, | |
| "eval_runtime": 53.5915, | |
| "eval_samples_per_second": 559.79, | |
| "eval_steps_per_second": 34.987, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.2884831652959642, | |
| "grad_norm": 0.09919234365224838, | |
| "learning_rate": 0.0006630594444444445, | |
| "loss": 0.0213, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.29337271047047203, | |
| "grad_norm": 0.08204931020736694, | |
| "learning_rate": 0.000661115, | |
| "loss": 0.0219, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.29337271047047203, | |
| "eval_accuracy": 0.998159, | |
| "eval_loss": 0.011940201744437218, | |
| "eval_runtime": 53.1317, | |
| "eval_samples_per_second": 564.635, | |
| "eval_steps_per_second": 35.29, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.2982622556449799, | |
| "grad_norm": 0.11553770303726196, | |
| "learning_rate": 0.0006591705555555556, | |
| "loss": 0.0208, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.3031518008194878, | |
| "grad_norm": 0.12381038069725037, | |
| "learning_rate": 0.0006572261111111111, | |
| "loss": 0.0205, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.3031518008194878, | |
| "eval_accuracy": 0.9982196666666666, | |
| "eval_loss": 0.011603106744587421, | |
| "eval_runtime": 53.375, | |
| "eval_samples_per_second": 562.061, | |
| "eval_steps_per_second": 35.129, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.30804134599399563, | |
| "grad_norm": 0.06441524624824524, | |
| "learning_rate": 0.0006552816666666667, | |
| "loss": 0.0204, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.31293089116850353, | |
| "grad_norm": 0.08449769020080566, | |
| "learning_rate": 0.0006533372222222222, | |
| "loss": 0.0206, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.31293089116850353, | |
| "eval_accuracy": 0.9982467142857143, | |
| "eval_loss": 0.011421745643019676, | |
| "eval_runtime": 53.2003, | |
| "eval_samples_per_second": 563.907, | |
| "eval_steps_per_second": 35.244, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.3178204363430114, | |
| "grad_norm": 0.07885874062776566, | |
| "learning_rate": 0.0006513927777777777, | |
| "loss": 0.02, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.32270998151751923, | |
| "grad_norm": 0.07178321480751038, | |
| "learning_rate": 0.0006494483333333333, | |
| "loss": 0.0193, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.32270998151751923, | |
| "eval_accuracy": 0.9983428571428571, | |
| "eval_loss": 0.011021795682609081, | |
| "eval_runtime": 53.8106, | |
| "eval_samples_per_second": 557.511, | |
| "eval_steps_per_second": 34.844, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.32759952669202713, | |
| "grad_norm": 0.06164510175585747, | |
| "learning_rate": 0.0006475038888888888, | |
| "loss": 0.0192, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.332489071866535, | |
| "grad_norm": 0.11073775589466095, | |
| "learning_rate": 0.0006455594444444444, | |
| "loss": 0.0193, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.332489071866535, | |
| "eval_accuracy": 0.9983445238095238, | |
| "eval_loss": 0.010947330854833126, | |
| "eval_runtime": 53.4068, | |
| "eval_samples_per_second": 561.727, | |
| "eval_steps_per_second": 35.108, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.3373786170410428, | |
| "grad_norm": 0.1216714084148407, | |
| "learning_rate": 0.0006436149999999999, | |
| "loss": 0.0191, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.34226816221555073, | |
| "grad_norm": 0.07570644468069077, | |
| "learning_rate": 0.0006416705555555556, | |
| "loss": 0.0189, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.34226816221555073, | |
| "eval_accuracy": 0.9984051904761905, | |
| "eval_loss": 0.01051774900406599, | |
| "eval_runtime": 53.8775, | |
| "eval_samples_per_second": 556.819, | |
| "eval_steps_per_second": 34.801, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.3471577073900586, | |
| "grad_norm": 0.10820703208446503, | |
| "learning_rate": 0.0006397261111111112, | |
| "loss": 0.0187, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.3520472525645664, | |
| "grad_norm": 0.13289569318294525, | |
| "learning_rate": 0.0006377816666666667, | |
| "loss": 0.0181, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.3520472525645664, | |
| "eval_accuracy": 0.9984183333333333, | |
| "eval_loss": 0.010617985390126705, | |
| "eval_runtime": 53.6453, | |
| "eval_samples_per_second": 559.229, | |
| "eval_steps_per_second": 34.952, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.35693679773907433, | |
| "grad_norm": 0.09950833022594452, | |
| "learning_rate": 0.0006358372222222223, | |
| "loss": 0.0178, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.3618263429135822, | |
| "grad_norm": 0.12055996805429459, | |
| "learning_rate": 0.0006338927777777778, | |
| "loss": 0.0174, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.3618263429135822, | |
| "eval_accuracy": 0.9984319047619048, | |
| "eval_loss": 0.01043427549302578, | |
| "eval_runtime": 53.8445, | |
| "eval_samples_per_second": 557.16, | |
| "eval_steps_per_second": 34.823, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.36671588808809, | |
| "grad_norm": 0.08831817656755447, | |
| "learning_rate": 0.0006319483333333334, | |
| "loss": 0.0183, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.37160543326259793, | |
| "grad_norm": 0.09790224581956863, | |
| "learning_rate": 0.0006300038888888889, | |
| "loss": 0.0171, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.37160543326259793, | |
| "eval_accuracy": 0.9984588571428571, | |
| "eval_loss": 0.010330071672797203, | |
| "eval_runtime": 53.8512, | |
| "eval_samples_per_second": 557.091, | |
| "eval_steps_per_second": 34.818, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.3764949784371058, | |
| "grad_norm": 0.05283864215016365, | |
| "learning_rate": 0.0006280594444444444, | |
| "loss": 0.017, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.3813845236116136, | |
| "grad_norm": 0.12874823808670044, | |
| "learning_rate": 0.000626115, | |
| "loss": 0.0173, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.3813845236116136, | |
| "eval_accuracy": 0.9984891904761904, | |
| "eval_loss": 0.009993654675781727, | |
| "eval_runtime": 53.3842, | |
| "eval_samples_per_second": 561.964, | |
| "eval_steps_per_second": 35.123, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.3862740687861215, | |
| "grad_norm": 0.08774898201227188, | |
| "learning_rate": 0.0006241705555555555, | |
| "loss": 0.0173, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.3911636139606294, | |
| "grad_norm": 0.092228963971138, | |
| "learning_rate": 0.0006222261111111111, | |
| "loss": 0.0169, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.3911636139606294, | |
| "eval_accuracy": 0.9984447142857142, | |
| "eval_loss": 0.010405597276985645, | |
| "eval_runtime": 53.1659, | |
| "eval_samples_per_second": 564.272, | |
| "eval_steps_per_second": 35.267, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.3960531591351372, | |
| "grad_norm": 0.08975362032651901, | |
| "learning_rate": 0.0006202816666666666, | |
| "loss": 0.0174, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.4009427043096451, | |
| "grad_norm": 0.09612125158309937, | |
| "learning_rate": 0.0006183372222222222, | |
| "loss": 0.0168, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.4009427043096451, | |
| "eval_accuracy": 0.9985740952380953, | |
| "eval_loss": 0.009390046820044518, | |
| "eval_runtime": 53.7483, | |
| "eval_samples_per_second": 558.158, | |
| "eval_steps_per_second": 34.885, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.405832249484153, | |
| "grad_norm": 0.04056503251194954, | |
| "learning_rate": 0.0006163927777777777, | |
| "loss": 0.0163, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.4107217946586609, | |
| "grad_norm": 0.11665570735931396, | |
| "learning_rate": 0.0006144483333333333, | |
| "loss": 0.0165, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.4107217946586609, | |
| "eval_accuracy": 0.998547619047619, | |
| "eval_loss": 0.009648078121244907, | |
| "eval_runtime": 53.5013, | |
| "eval_samples_per_second": 560.734, | |
| "eval_steps_per_second": 35.046, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.4156113398331687, | |
| "grad_norm": 0.10102874785661697, | |
| "learning_rate": 0.000612503888888889, | |
| "loss": 0.0163, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.4205008850076766, | |
| "grad_norm": 0.08108735084533691, | |
| "learning_rate": 0.0006105594444444445, | |
| "loss": 0.0154, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.4205008850076766, | |
| "eval_accuracy": 0.998580380952381, | |
| "eval_loss": 0.009399999864399433, | |
| "eval_runtime": 53.6417, | |
| "eval_samples_per_second": 559.266, | |
| "eval_steps_per_second": 34.954, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.4253904301821845, | |
| "grad_norm": 0.07910118252038956, | |
| "learning_rate": 0.000608615, | |
| "loss": 0.0158, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.4302799753566923, | |
| "grad_norm": 0.0742466077208519, | |
| "learning_rate": 0.0006066705555555556, | |
| "loss": 0.0154, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.4302799753566923, | |
| "eval_accuracy": 0.9986305238095238, | |
| "eval_loss": 0.009053844027221203, | |
| "eval_runtime": 53.2625, | |
| "eval_samples_per_second": 563.248, | |
| "eval_steps_per_second": 35.203, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.43516952053120017, | |
| "grad_norm": 0.06712730973958969, | |
| "learning_rate": 0.0006047261111111111, | |
| "loss": 0.0157, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.4400590657057081, | |
| "grad_norm": 0.049518078565597534, | |
| "learning_rate": 0.0006027816666666667, | |
| "loss": 0.0154, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.4400590657057081, | |
| "eval_accuracy": 0.9986142857142857, | |
| "eval_loss": 0.009205291979014874, | |
| "eval_runtime": 55.4539, | |
| "eval_samples_per_second": 540.99, | |
| "eval_steps_per_second": 33.812, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.4449486108802159, | |
| "grad_norm": 0.0538068488240242, | |
| "learning_rate": 0.0006008372222222222, | |
| "loss": 0.0154, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.44983815605472377, | |
| "grad_norm": 0.08187378942966461, | |
| "learning_rate": 0.0005988927777777778, | |
| "loss": 0.015, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.44983815605472377, | |
| "eval_accuracy": 0.9986327142857143, | |
| "eval_loss": 0.009027380496263504, | |
| "eval_runtime": 53.2362, | |
| "eval_samples_per_second": 563.526, | |
| "eval_steps_per_second": 35.22, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.4547277012292317, | |
| "grad_norm": 0.04306895285844803, | |
| "learning_rate": 0.0005969483333333333, | |
| "loss": 0.0153, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.4596172464037395, | |
| "grad_norm": 0.053645290434360504, | |
| "learning_rate": 0.0005950038888888889, | |
| "loss": 0.0146, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.4596172464037395, | |
| "eval_accuracy": 0.998660619047619, | |
| "eval_loss": 0.008829508908092976, | |
| "eval_runtime": 54.0772, | |
| "eval_samples_per_second": 554.763, | |
| "eval_steps_per_second": 34.673, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.46450679157824737, | |
| "grad_norm": 0.08367203176021576, | |
| "learning_rate": 0.0005930594444444444, | |
| "loss": 0.0149, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.46939633675275527, | |
| "grad_norm": 0.06427811086177826, | |
| "learning_rate": 0.000591115, | |
| "loss": 0.0146, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.46939633675275527, | |
| "eval_accuracy": 0.9986682857142857, | |
| "eval_loss": 0.008711729198694229, | |
| "eval_runtime": 54.7568, | |
| "eval_samples_per_second": 547.877, | |
| "eval_steps_per_second": 34.242, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.4742858819272631, | |
| "grad_norm": 0.09367698431015015, | |
| "learning_rate": 0.0005891705555555556, | |
| "loss": 0.0146, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.47917542710177097, | |
| "grad_norm": 0.023252153769135475, | |
| "learning_rate": 0.0005872261111111111, | |
| "loss": 0.0143, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.47917542710177097, | |
| "eval_accuracy": 0.9987029047619047, | |
| "eval_loss": 0.00848183874040842, | |
| "eval_runtime": 53.9633, | |
| "eval_samples_per_second": 555.933, | |
| "eval_steps_per_second": 34.746, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.48406497227627887, | |
| "grad_norm": 0.038976676762104034, | |
| "learning_rate": 0.0005852816666666666, | |
| "loss": 0.0142, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.4889545174507867, | |
| "grad_norm": 0.048157546669244766, | |
| "learning_rate": 0.0005833372222222221, | |
| "loss": 0.0146, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.4889545174507867, | |
| "eval_accuracy": 0.9986898571428572, | |
| "eval_loss": 0.008633621968328953, | |
| "eval_runtime": 53.3435, | |
| "eval_samples_per_second": 562.392, | |
| "eval_steps_per_second": 35.15, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.4938440626252946, | |
| "grad_norm": 0.04257979243993759, | |
| "learning_rate": 0.0005813927777777777, | |
| "loss": 0.0145, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 0.49873360779980247, | |
| "grad_norm": 0.09921249002218246, | |
| "learning_rate": 0.0005794483333333334, | |
| "loss": 0.0142, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.49873360779980247, | |
| "eval_accuracy": 0.9987676666666667, | |
| "eval_loss": 0.008316335268318653, | |
| "eval_runtime": 53.6985, | |
| "eval_samples_per_second": 558.675, | |
| "eval_steps_per_second": 34.917, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.5036231529743104, | |
| "grad_norm": 0.048569273203611374, | |
| "learning_rate": 0.0005775038888888889, | |
| "loss": 0.0135, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 0.5085126981488182, | |
| "grad_norm": 0.06064219772815704, | |
| "learning_rate": 0.0005755594444444445, | |
| "loss": 0.0139, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.5085126981488182, | |
| "eval_accuracy": 0.9987182380952381, | |
| "eval_loss": 0.008500739932060242, | |
| "eval_runtime": 53.1478, | |
| "eval_samples_per_second": 564.463, | |
| "eval_steps_per_second": 35.279, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.5134022433233261, | |
| "grad_norm": 0.043598126620054245, | |
| "learning_rate": 0.000573615, | |
| "loss": 0.0145, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 0.518291788497834, | |
| "grad_norm": 0.059862203896045685, | |
| "learning_rate": 0.0005716705555555556, | |
| "loss": 0.0134, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.518291788497834, | |
| "eval_accuracy": 0.9987784761904762, | |
| "eval_loss": 0.008033830672502518, | |
| "eval_runtime": 55.7465, | |
| "eval_samples_per_second": 538.15, | |
| "eval_steps_per_second": 33.634, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.5231813336723418, | |
| "grad_norm": 0.05372610315680504, | |
| "learning_rate": 0.0005697261111111111, | |
| "loss": 0.0136, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 0.5280708788468497, | |
| "grad_norm": 0.08553345501422882, | |
| "learning_rate": 0.0005677816666666667, | |
| "loss": 0.0138, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.5280708788468497, | |
| "eval_accuracy": 0.9988229047619047, | |
| "eval_loss": 0.007664266973733902, | |
| "eval_runtime": 53.9758, | |
| "eval_samples_per_second": 555.805, | |
| "eval_steps_per_second": 34.738, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.5329604240213576, | |
| "grad_norm": 0.03992351144552231, | |
| "learning_rate": 0.0005658372222222222, | |
| "loss": 0.0133, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 0.5378499691958654, | |
| "grad_norm": 0.051119010895490646, | |
| "learning_rate": 0.0005638927777777777, | |
| "loss": 0.0135, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.5378499691958654, | |
| "eval_accuracy": 0.9988099523809524, | |
| "eval_loss": 0.007848628796637058, | |
| "eval_runtime": 54.3513, | |
| "eval_samples_per_second": 551.965, | |
| "eval_steps_per_second": 34.498, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.5427395143703733, | |
| "grad_norm": 0.08714370429515839, | |
| "learning_rate": 0.0005619483333333333, | |
| "loss": 0.0128, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 0.5476290595448812, | |
| "grad_norm": 0.07373756170272827, | |
| "learning_rate": 0.0005600038888888888, | |
| "loss": 0.013, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.5476290595448812, | |
| "eval_accuracy": 0.9988279047619048, | |
| "eval_loss": 0.007725988980382681, | |
| "eval_runtime": 53.269, | |
| "eval_samples_per_second": 563.179, | |
| "eval_steps_per_second": 35.199, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.552518604719389, | |
| "grad_norm": 0.04964112490415573, | |
| "learning_rate": 0.0005580594444444444, | |
| "loss": 0.0132, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 0.5574081498938969, | |
| "grad_norm": 0.08856749534606934, | |
| "learning_rate": 0.000556115, | |
| "loss": 0.0128, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.5574081498938969, | |
| "eval_accuracy": 0.998819, | |
| "eval_loss": 0.007981804199516773, | |
| "eval_runtime": 54.4577, | |
| "eval_samples_per_second": 550.886, | |
| "eval_steps_per_second": 34.43, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.5622976950684048, | |
| "grad_norm": 0.06801512092351913, | |
| "learning_rate": 0.0005541705555555555, | |
| "loss": 0.0129, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 0.5671872402429126, | |
| "grad_norm": 0.21337199211120605, | |
| "learning_rate": 0.0005522261111111112, | |
| "loss": 0.0131, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.5671872402429126, | |
| "eval_accuracy": 0.9988361904761904, | |
| "eval_loss": 0.0077649368904531, | |
| "eval_runtime": 53.1495, | |
| "eval_samples_per_second": 564.445, | |
| "eval_steps_per_second": 35.278, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.5720767854174205, | |
| "grad_norm": 0.0754612609744072, | |
| "learning_rate": 0.0005502816666666667, | |
| "loss": 0.013, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 0.5769663305919284, | |
| "grad_norm": 0.053277261555194855, | |
| "learning_rate": 0.0005483372222222223, | |
| "loss": 0.013, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.5769663305919284, | |
| "eval_accuracy": 0.9988713333333333, | |
| "eval_loss": 0.007438257802277803, | |
| "eval_runtime": 53.9725, | |
| "eval_samples_per_second": 555.839, | |
| "eval_steps_per_second": 34.74, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.5818558757664362, | |
| "grad_norm": 0.057580217719078064, | |
| "learning_rate": 0.0005463927777777778, | |
| "loss": 0.0126, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 0.5867454209409441, | |
| "grad_norm": 0.08538717031478882, | |
| "learning_rate": 0.0005444483333333334, | |
| "loss": 0.0125, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.5867454209409441, | |
| "eval_accuracy": 0.9988772857142857, | |
| "eval_loss": 0.0073426892049610615, | |
| "eval_runtime": 53.9301, | |
| "eval_samples_per_second": 556.276, | |
| "eval_steps_per_second": 34.767, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.591634966115452, | |
| "grad_norm": 0.07628747820854187, | |
| "learning_rate": 0.0005425038888888889, | |
| "loss": 0.0127, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 0.5965245112899598, | |
| "grad_norm": 0.059503812342882156, | |
| "learning_rate": 0.0005405594444444444, | |
| "loss": 0.0119, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.5965245112899598, | |
| "eval_accuracy": 0.9988955714285714, | |
| "eval_loss": 0.007260579615831375, | |
| "eval_runtime": 54.1591, | |
| "eval_samples_per_second": 553.924, | |
| "eval_steps_per_second": 34.62, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.6014140564644677, | |
| "grad_norm": 0.07128513604402542, | |
| "learning_rate": 0.000538615, | |
| "loss": 0.012, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 0.6063036016389756, | |
| "grad_norm": 0.0615658275783062, | |
| "learning_rate": 0.0005366705555555555, | |
| "loss": 0.0121, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.6063036016389756, | |
| "eval_accuracy": 0.9988924285714286, | |
| "eval_loss": 0.007354605942964554, | |
| "eval_runtime": 53.6133, | |
| "eval_samples_per_second": 559.563, | |
| "eval_steps_per_second": 34.973, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.6111931468134834, | |
| "grad_norm": 0.04977503791451454, | |
| "learning_rate": 0.0005347261111111111, | |
| "loss": 0.0125, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.6160826919879913, | |
| "grad_norm": 0.06748691946268082, | |
| "learning_rate": 0.0005327816666666666, | |
| "loss": 0.0123, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.6160826919879913, | |
| "eval_accuracy": 0.9989074761904762, | |
| "eval_loss": 0.007279036566615105, | |
| "eval_runtime": 54.221, | |
| "eval_samples_per_second": 553.291, | |
| "eval_steps_per_second": 34.581, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.6209722371624992, | |
| "grad_norm": 0.08432789891958237, | |
| "learning_rate": 0.0005308372222222222, | |
| "loss": 0.0119, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 0.6258617823370071, | |
| "grad_norm": 0.08450587093830109, | |
| "learning_rate": 0.0005288927777777778, | |
| "loss": 0.0123, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.6258617823370071, | |
| "eval_accuracy": 0.998906619047619, | |
| "eval_loss": 0.007195043843239546, | |
| "eval_runtime": 53.6077, | |
| "eval_samples_per_second": 559.621, | |
| "eval_steps_per_second": 34.976, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.6307513275115149, | |
| "grad_norm": 0.05454770103096962, | |
| "learning_rate": 0.0005269483333333333, | |
| "loss": 0.0119, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 0.6356408726860228, | |
| "grad_norm": 0.029517434537410736, | |
| "learning_rate": 0.0005250038888888889, | |
| "loss": 0.0115, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.6356408726860228, | |
| "eval_accuracy": 0.99894, | |
| "eval_loss": 0.006976461503654718, | |
| "eval_runtime": 54.3436, | |
| "eval_samples_per_second": 552.043, | |
| "eval_steps_per_second": 34.503, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.6405304178605307, | |
| "grad_norm": 0.08749569207429886, | |
| "learning_rate": 0.0005230594444444444, | |
| "loss": 0.0117, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 0.6454199630350385, | |
| "grad_norm": 0.08669404685497284, | |
| "learning_rate": 0.000521115, | |
| "loss": 0.0118, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.6454199630350385, | |
| "eval_accuracy": 0.9989269523809524, | |
| "eval_loss": 0.0070405821315944195, | |
| "eval_runtime": 53.0176, | |
| "eval_samples_per_second": 565.85, | |
| "eval_steps_per_second": 35.366, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.6503095082095464, | |
| "grad_norm": 0.08068472146987915, | |
| "learning_rate": 0.0005191705555555556, | |
| "loss": 0.012, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 0.6551990533840543, | |
| "grad_norm": 0.06560824811458588, | |
| "learning_rate": 0.0005172261111111111, | |
| "loss": 0.0113, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.6551990533840543, | |
| "eval_accuracy": 0.9989625238095238, | |
| "eval_loss": 0.006877726875245571, | |
| "eval_runtime": 57.7066, | |
| "eval_samples_per_second": 519.871, | |
| "eval_steps_per_second": 32.492, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.6600885985585621, | |
| "grad_norm": 0.10351342707872391, | |
| "learning_rate": 0.0005152816666666667, | |
| "loss": 0.0117, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 0.66497814373307, | |
| "grad_norm": 0.06295846402645111, | |
| "learning_rate": 0.0005133372222222222, | |
| "loss": 0.0113, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.66497814373307, | |
| "eval_accuracy": 0.9989663333333333, | |
| "eval_loss": 0.006734638474881649, | |
| "eval_runtime": 56.5776, | |
| "eval_samples_per_second": 530.245, | |
| "eval_steps_per_second": 33.14, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.6698676889075779, | |
| "grad_norm": 0.07197780162096024, | |
| "learning_rate": 0.0005113927777777778, | |
| "loss": 0.0112, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 0.6747572340820857, | |
| "grad_norm": 0.05394699051976204, | |
| "learning_rate": 0.0005094483333333333, | |
| "loss": 0.0111, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.6747572340820857, | |
| "eval_accuracy": 0.9989654761904762, | |
| "eval_loss": 0.006897720508277416, | |
| "eval_runtime": 53.9516, | |
| "eval_samples_per_second": 556.054, | |
| "eval_steps_per_second": 34.753, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.6796467792565936, | |
| "grad_norm": 0.08804675191640854, | |
| "learning_rate": 0.0005075038888888889, | |
| "loss": 0.0114, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 0.6845363244311015, | |
| "grad_norm": 0.061258211731910706, | |
| "learning_rate": 0.0005055594444444445, | |
| "loss": 0.0116, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.6845363244311015, | |
| "eval_accuracy": 0.998991619047619, | |
| "eval_loss": 0.006613132543861866, | |
| "eval_runtime": 53.4248, | |
| "eval_samples_per_second": 561.537, | |
| "eval_steps_per_second": 35.096, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.6894258696056093, | |
| "grad_norm": 0.047413647174835205, | |
| "learning_rate": 0.000503615, | |
| "loss": 0.0114, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 0.6943154147801172, | |
| "grad_norm": 0.048444923013448715, | |
| "learning_rate": 0.0005016705555555556, | |
| "loss": 0.0111, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.6943154147801172, | |
| "eval_accuracy": 0.9989892857142857, | |
| "eval_loss": 0.006757956929504871, | |
| "eval_runtime": 54.0915, | |
| "eval_samples_per_second": 554.616, | |
| "eval_steps_per_second": 34.663, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.6992049599546251, | |
| "grad_norm": 0.0633966252207756, | |
| "learning_rate": 0.0004997261111111111, | |
| "loss": 0.011, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 0.7040945051291329, | |
| "grad_norm": 0.05330997332930565, | |
| "learning_rate": 0.0004977816666666666, | |
| "loss": 0.0111, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.7040945051291329, | |
| "eval_accuracy": 0.9989945238095238, | |
| "eval_loss": 0.006628294009715319, | |
| "eval_runtime": 53.5745, | |
| "eval_samples_per_second": 559.968, | |
| "eval_steps_per_second": 34.998, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.7089840503036408, | |
| "grad_norm": 0.08384311944246292, | |
| "learning_rate": 0.0004958372222222222, | |
| "loss": 0.0112, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 0.7138735954781487, | |
| "grad_norm": 0.012912419624626637, | |
| "learning_rate": 0.0004938927777777777, | |
| "loss": 0.0108, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.7138735954781487, | |
| "eval_accuracy": 0.9990231904761905, | |
| "eval_loss": 0.0064848195761442184, | |
| "eval_runtime": 53.4714, | |
| "eval_samples_per_second": 561.048, | |
| "eval_steps_per_second": 35.065, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.7187631406526565, | |
| "grad_norm": 0.03586062043905258, | |
| "learning_rate": 0.0004919483333333333, | |
| "loss": 0.0106, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 0.7236526858271644, | |
| "grad_norm": 0.03920240327715874, | |
| "learning_rate": 0.0004900038888888888, | |
| "loss": 0.0108, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.7236526858271644, | |
| "eval_accuracy": 0.9990156666666666, | |
| "eval_loss": 0.00646663922816515, | |
| "eval_runtime": 53.3973, | |
| "eval_samples_per_second": 561.827, | |
| "eval_steps_per_second": 35.114, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.7285422310016723, | |
| "grad_norm": 0.07299363613128662, | |
| "learning_rate": 0.00048805944444444446, | |
| "loss": 0.0109, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 0.73343177617618, | |
| "grad_norm": 0.061152711510658264, | |
| "learning_rate": 0.000486115, | |
| "loss": 0.0102, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.73343177617618, | |
| "eval_accuracy": 0.9990235714285715, | |
| "eval_loss": 0.00648918654769659, | |
| "eval_runtime": 54.8259, | |
| "eval_samples_per_second": 547.187, | |
| "eval_steps_per_second": 34.199, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.738321321350688, | |
| "grad_norm": 0.052978385239839554, | |
| "learning_rate": 0.0004841705555555556, | |
| "loss": 0.0108, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 0.7432108665251959, | |
| "grad_norm": 0.03460371494293213, | |
| "learning_rate": 0.00048222611111111113, | |
| "loss": 0.0104, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.7432108665251959, | |
| "eval_accuracy": 0.9990159047619047, | |
| "eval_loss": 0.006446553394198418, | |
| "eval_runtime": 53.4946, | |
| "eval_samples_per_second": 560.804, | |
| "eval_steps_per_second": 35.05, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.7481004116997036, | |
| "grad_norm": 0.08936499804258347, | |
| "learning_rate": 0.0004802816666666667, | |
| "loss": 0.0105, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 0.7529899568742116, | |
| "grad_norm": 0.04613318666815758, | |
| "learning_rate": 0.00047833722222222224, | |
| "loss": 0.0104, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.7529899568742116, | |
| "eval_accuracy": 0.9990385714285714, | |
| "eval_loss": 0.0063977050594985485, | |
| "eval_runtime": 54.4466, | |
| "eval_samples_per_second": 550.998, | |
| "eval_steps_per_second": 34.437, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.7578795020487195, | |
| "grad_norm": 0.05318485200405121, | |
| "learning_rate": 0.00047639277777777775, | |
| "loss": 0.0106, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 0.7627690472232272, | |
| "grad_norm": 0.061067450791597366, | |
| "learning_rate": 0.0004744483333333333, | |
| "loss": 0.0101, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.7627690472232272, | |
| "eval_accuracy": 0.9990490952380953, | |
| "eval_loss": 0.006357032340019941, | |
| "eval_runtime": 53.2067, | |
| "eval_samples_per_second": 563.839, | |
| "eval_steps_per_second": 35.24, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.7676585923977352, | |
| "grad_norm": 0.042733557522296906, | |
| "learning_rate": 0.00047250388888888886, | |
| "loss": 0.01, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 0.772548137572243, | |
| "grad_norm": 0.08034121245145798, | |
| "learning_rate": 0.0004705594444444444, | |
| "loss": 0.0103, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.772548137572243, | |
| "eval_accuracy": 0.9990574761904762, | |
| "eval_loss": 0.0062187593430280685, | |
| "eval_runtime": 53.8428, | |
| "eval_samples_per_second": 557.177, | |
| "eval_steps_per_second": 34.824, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.7774376827467508, | |
| "grad_norm": 0.07830695807933807, | |
| "learning_rate": 0.00046861499999999997, | |
| "loss": 0.0101, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 0.7823272279212587, | |
| "grad_norm": 0.07382604479789734, | |
| "learning_rate": 0.00046667055555555553, | |
| "loss": 0.0103, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.7823272279212587, | |
| "eval_accuracy": 0.9990759047619048, | |
| "eval_loss": 0.006141056306660175, | |
| "eval_runtime": 53.2408, | |
| "eval_samples_per_second": 563.478, | |
| "eval_steps_per_second": 35.217, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.7872167730957667, | |
| "grad_norm": 0.1125330850481987, | |
| "learning_rate": 0.00046472611111111114, | |
| "loss": 0.0102, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 0.7921063182702744, | |
| "grad_norm": 0.03520214557647705, | |
| "learning_rate": 0.0004627816666666667, | |
| "loss": 0.01, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.7921063182702744, | |
| "eval_accuracy": 0.9990739047619047, | |
| "eval_loss": 0.006076267920434475, | |
| "eval_runtime": 53.791, | |
| "eval_samples_per_second": 557.715, | |
| "eval_steps_per_second": 34.857, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.7969958634447823, | |
| "grad_norm": 0.042487915605306625, | |
| "learning_rate": 0.00046083722222222225, | |
| "loss": 0.0097, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 0.8018854086192903, | |
| "grad_norm": 0.054117601364851, | |
| "learning_rate": 0.0004588927777777778, | |
| "loss": 0.0101, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.8018854086192903, | |
| "eval_accuracy": 0.9990634761904762, | |
| "eval_loss": 0.006164718419313431, | |
| "eval_runtime": 53.5332, | |
| "eval_samples_per_second": 560.4, | |
| "eval_steps_per_second": 35.025, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.8067749537937982, | |
| "grad_norm": 0.04976029694080353, | |
| "learning_rate": 0.00045694833333333336, | |
| "loss": 0.0099, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 0.811664498968306, | |
| "grad_norm": 0.054267916828393936, | |
| "learning_rate": 0.00045500388888888887, | |
| "loss": 0.0097, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.811664498968306, | |
| "eval_accuracy": 0.9990979047619047, | |
| "eval_loss": 0.005987876560539007, | |
| "eval_runtime": 54.2289, | |
| "eval_samples_per_second": 553.211, | |
| "eval_steps_per_second": 34.576, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.8165540441428139, | |
| "grad_norm": 0.03349093720316887, | |
| "learning_rate": 0.0004530594444444444, | |
| "loss": 0.0094, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 0.8214435893173218, | |
| "grad_norm": 0.04999032989144325, | |
| "learning_rate": 0.000451115, | |
| "loss": 0.0101, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.8214435893173218, | |
| "eval_accuracy": 0.9990905714285714, | |
| "eval_loss": 0.006010835990309715, | |
| "eval_runtime": 53.4299, | |
| "eval_samples_per_second": 561.483, | |
| "eval_steps_per_second": 35.093, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.8263331344918295, | |
| "grad_norm": 0.045149870216846466, | |
| "learning_rate": 0.00044917055555555554, | |
| "loss": 0.0097, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 0.8312226796663374, | |
| "grad_norm": 0.0918109267950058, | |
| "learning_rate": 0.0004472261111111111, | |
| "loss": 0.0099, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.8312226796663374, | |
| "eval_accuracy": 0.9990772380952381, | |
| "eval_loss": 0.006181794218719006, | |
| "eval_runtime": 54.1897, | |
| "eval_samples_per_second": 553.611, | |
| "eval_steps_per_second": 34.601, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.8361122248408454, | |
| "grad_norm": 0.0643276646733284, | |
| "learning_rate": 0.00044528166666666665, | |
| "loss": 0.0099, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 0.8410017700153531, | |
| "grad_norm": 0.06930361688137054, | |
| "learning_rate": 0.0004433372222222222, | |
| "loss": 0.0095, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.8410017700153531, | |
| "eval_accuracy": 0.9991025238095238, | |
| "eval_loss": 0.00590873695909977, | |
| "eval_runtime": 52.8942, | |
| "eval_samples_per_second": 567.169, | |
| "eval_steps_per_second": 35.448, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.845891315189861, | |
| "grad_norm": 0.08163397759199142, | |
| "learning_rate": 0.00044139277777777776, | |
| "loss": 0.0099, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 0.850780860364369, | |
| "grad_norm": 0.0483279749751091, | |
| "learning_rate": 0.00043944833333333337, | |
| "loss": 0.0092, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.850780860364369, | |
| "eval_accuracy": 0.9991092857142857, | |
| "eval_loss": 0.006001894827932119, | |
| "eval_runtime": 53.2268, | |
| "eval_samples_per_second": 563.626, | |
| "eval_steps_per_second": 35.227, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.8556704055388767, | |
| "grad_norm": 0.02636638656258583, | |
| "learning_rate": 0.00043750388888888893, | |
| "loss": 0.0094, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 0.8605599507133846, | |
| "grad_norm": 0.042217135429382324, | |
| "learning_rate": 0.0004355594444444445, | |
| "loss": 0.0092, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.8605599507133846, | |
| "eval_accuracy": 0.999128, | |
| "eval_loss": 0.005815317388623953, | |
| "eval_runtime": 53.8299, | |
| "eval_samples_per_second": 557.311, | |
| "eval_steps_per_second": 34.832, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.8654494958878926, | |
| "grad_norm": 0.08632192760705948, | |
| "learning_rate": 0.00043361499999999993, | |
| "loss": 0.0092, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 0.8703390410624003, | |
| "grad_norm": 0.04315312206745148, | |
| "learning_rate": 0.00043167055555555554, | |
| "loss": 0.0094, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.8703390410624003, | |
| "eval_accuracy": 0.9991279047619047, | |
| "eval_loss": 0.0056898752227425575, | |
| "eval_runtime": 53.7075, | |
| "eval_samples_per_second": 558.581, | |
| "eval_steps_per_second": 34.911, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.8752285862369082, | |
| "grad_norm": 0.03837065026164055, | |
| "learning_rate": 0.0004297261111111111, | |
| "loss": 0.0094, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 0.8801181314114161, | |
| "grad_norm": 0.04201444238424301, | |
| "learning_rate": 0.00042778166666666666, | |
| "loss": 0.0093, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.8801181314114161, | |
| "eval_accuracy": 0.9991310952380953, | |
| "eval_loss": 0.00587738212198019, | |
| "eval_runtime": 53.5135, | |
| "eval_samples_per_second": 560.606, | |
| "eval_steps_per_second": 35.038, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.8850076765859239, | |
| "grad_norm": 0.061635617166757584, | |
| "learning_rate": 0.0004258372222222222, | |
| "loss": 0.0092, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 0.8898972217604318, | |
| "grad_norm": 0.03518196567893028, | |
| "learning_rate": 0.00042389277777777777, | |
| "loss": 0.0088, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.8898972217604318, | |
| "eval_accuracy": 0.9991415238095238, | |
| "eval_loss": 0.005721970461308956, | |
| "eval_runtime": 53.7456, | |
| "eval_samples_per_second": 558.185, | |
| "eval_steps_per_second": 34.887, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.8947867669349397, | |
| "grad_norm": 0.06095174327492714, | |
| "learning_rate": 0.0004219483333333333, | |
| "loss": 0.0095, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 0.8996763121094475, | |
| "grad_norm": 0.03404530510306358, | |
| "learning_rate": 0.0004200038888888889, | |
| "loss": 0.0091, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.8996763121094475, | |
| "eval_accuracy": 0.9991448571428572, | |
| "eval_loss": 0.0056047323159873486, | |
| "eval_runtime": 53.6229, | |
| "eval_samples_per_second": 559.463, | |
| "eval_steps_per_second": 34.966, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.9045658572839554, | |
| "grad_norm": 0.044711388647556305, | |
| "learning_rate": 0.00041805944444444444, | |
| "loss": 0.0094, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 0.9094554024584633, | |
| "grad_norm": 0.025318428874015808, | |
| "learning_rate": 0.000416115, | |
| "loss": 0.0091, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.9094554024584633, | |
| "eval_accuracy": 0.9991459047619048, | |
| "eval_loss": 0.0056663015857338905, | |
| "eval_runtime": 53.7217, | |
| "eval_samples_per_second": 558.433, | |
| "eval_steps_per_second": 34.902, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.9143449476329711, | |
| "grad_norm": 0.09479326009750366, | |
| "learning_rate": 0.0004141705555555556, | |
| "loss": 0.0091, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 0.919234492807479, | |
| "grad_norm": 0.04621125012636185, | |
| "learning_rate": 0.00041222611111111116, | |
| "loss": 0.0091, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.919234492807479, | |
| "eval_accuracy": 0.9991637619047619, | |
| "eval_loss": 0.005490881856530905, | |
| "eval_runtime": 52.8914, | |
| "eval_samples_per_second": 567.2, | |
| "eval_steps_per_second": 35.45, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.924124037981987, | |
| "grad_norm": 0.11758420616388321, | |
| "learning_rate": 0.0004102816666666666, | |
| "loss": 0.0091, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 0.9290135831564947, | |
| "grad_norm": 0.048568353056907654, | |
| "learning_rate": 0.00040833722222222217, | |
| "loss": 0.0085, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.9290135831564947, | |
| "eval_accuracy": 0.9991408571428572, | |
| "eval_loss": 0.0056878020986914635, | |
| "eval_runtime": 54.7817, | |
| "eval_samples_per_second": 547.628, | |
| "eval_steps_per_second": 34.227, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.9339031283310026, | |
| "grad_norm": 0.12460034340620041, | |
| "learning_rate": 0.0004063927777777778, | |
| "loss": 0.0089, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 0.9387926735055105, | |
| "grad_norm": 0.04623766988515854, | |
| "learning_rate": 0.00040444833333333334, | |
| "loss": 0.0087, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.9387926735055105, | |
| "eval_accuracy": 0.9991676190476191, | |
| "eval_loss": 0.005500451661646366, | |
| "eval_runtime": 53.9981, | |
| "eval_samples_per_second": 555.575, | |
| "eval_steps_per_second": 34.723, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.9436822186800183, | |
| "grad_norm": 0.08665420114994049, | |
| "learning_rate": 0.0004025038888888889, | |
| "loss": 0.0087, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 0.9485717638545262, | |
| "grad_norm": 0.0452926941215992, | |
| "learning_rate": 0.00040055944444444445, | |
| "loss": 0.0084, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.9485717638545262, | |
| "eval_accuracy": 0.999164, | |
| "eval_loss": 0.005574519746005535, | |
| "eval_runtime": 54.6981, | |
| "eval_samples_per_second": 548.465, | |
| "eval_steps_per_second": 34.279, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.9534613090290341, | |
| "grad_norm": 0.03491511195898056, | |
| "learning_rate": 0.000398615, | |
| "loss": 0.0086, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 0.9583508542035419, | |
| "grad_norm": 0.044573381543159485, | |
| "learning_rate": 0.00039667055555555556, | |
| "loss": 0.0089, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.9583508542035419, | |
| "eval_accuracy": 0.9991894285714286, | |
| "eval_loss": 0.005372173152863979, | |
| "eval_runtime": 53.4094, | |
| "eval_samples_per_second": 561.699, | |
| "eval_steps_per_second": 35.106, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.9632403993780498, | |
| "grad_norm": 0.02608780935406685, | |
| "learning_rate": 0.0003947261111111111, | |
| "loss": 0.0086, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 0.9681299445525577, | |
| "grad_norm": 0.04312971234321594, | |
| "learning_rate": 0.0003927816666666667, | |
| "loss": 0.0086, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.9681299445525577, | |
| "eval_accuracy": 0.9991722380952381, | |
| "eval_loss": 0.0054678237065672874, | |
| "eval_runtime": 54.0015, | |
| "eval_samples_per_second": 555.541, | |
| "eval_steps_per_second": 34.721, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.9730194897270655, | |
| "grad_norm": 0.06294015049934387, | |
| "learning_rate": 0.00039083722222222223, | |
| "loss": 0.0085, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 0.9779090349015734, | |
| "grad_norm": 0.029000315815210342, | |
| "learning_rate": 0.00038889277777777773, | |
| "loss": 0.0087, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.9779090349015734, | |
| "eval_accuracy": 0.999185380952381, | |
| "eval_loss": 0.005396171938627958, | |
| "eval_runtime": 55.6579, | |
| "eval_samples_per_second": 539.007, | |
| "eval_steps_per_second": 33.688, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.9827985800760813, | |
| "grad_norm": 0.04323006793856621, | |
| "learning_rate": 0.0003869483333333333, | |
| "loss": 0.0087, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 0.9876881252505892, | |
| "grad_norm": 0.0731167271733284, | |
| "learning_rate": 0.00038500388888888885, | |
| "loss": 0.0081, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 0.9876881252505892, | |
| "eval_accuracy": 0.9991765238095238, | |
| "eval_loss": 0.005412892438471317, | |
| "eval_runtime": 55.9769, | |
| "eval_samples_per_second": 535.935, | |
| "eval_steps_per_second": 33.496, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 0.992577670425097, | |
| "grad_norm": 0.023585299029946327, | |
| "learning_rate": 0.0003830594444444444, | |
| "loss": 0.0088, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 0.9974672155996049, | |
| "grad_norm": 0.08938384801149368, | |
| "learning_rate": 0.000381115, | |
| "loss": 0.0086, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 0.9974672155996049, | |
| "eval_accuracy": 0.9991979047619047, | |
| "eval_loss": 0.005323469173163176, | |
| "eval_runtime": 53.2851, | |
| "eval_samples_per_second": 563.009, | |
| "eval_steps_per_second": 35.188, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 1.0023567607741128, | |
| "grad_norm": 0.038682036101818085, | |
| "learning_rate": 0.00037917055555555557, | |
| "loss": 0.0082, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 1.0072463059486207, | |
| "grad_norm": 0.07080361992120743, | |
| "learning_rate": 0.0003772261111111111, | |
| "loss": 0.0081, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 1.0072463059486207, | |
| "eval_accuracy": 0.9992074761904762, | |
| "eval_loss": 0.00541540514677763, | |
| "eval_runtime": 54.1542, | |
| "eval_samples_per_second": 553.974, | |
| "eval_steps_per_second": 34.623, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 1.0121358511231284, | |
| "grad_norm": 0.0545232892036438, | |
| "learning_rate": 0.0003752816666666667, | |
| "loss": 0.0079, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 1.0170253962976363, | |
| "grad_norm": 0.05419744551181793, | |
| "learning_rate": 0.00037333722222222224, | |
| "loss": 0.0083, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 1.0170253962976363, | |
| "eval_accuracy": 0.999227380952381, | |
| "eval_loss": 0.005181997548788786, | |
| "eval_runtime": 54.7563, | |
| "eval_samples_per_second": 547.882, | |
| "eval_steps_per_second": 34.243, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 1.0219149414721442, | |
| "grad_norm": 0.062064480036497116, | |
| "learning_rate": 0.0003713927777777778, | |
| "loss": 0.0078, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 1.0268044866466521, | |
| "grad_norm": 0.0431884303689003, | |
| "learning_rate": 0.00036944833333333335, | |
| "loss": 0.0078, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 1.0268044866466521, | |
| "eval_accuracy": 0.999227380952381, | |
| "eval_loss": 0.005218331702053547, | |
| "eval_runtime": 53.5479, | |
| "eval_samples_per_second": 560.246, | |
| "eval_steps_per_second": 35.015, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 1.03169403182116, | |
| "grad_norm": 0.035419270396232605, | |
| "learning_rate": 0.00036750388888888885, | |
| "loss": 0.0079, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 1.036583576995668, | |
| "grad_norm": 0.03565732017159462, | |
| "learning_rate": 0.0003655594444444444, | |
| "loss": 0.0078, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 1.036583576995668, | |
| "eval_accuracy": 0.9992299523809524, | |
| "eval_loss": 0.005135852377861738, | |
| "eval_runtime": 54.0577, | |
| "eval_samples_per_second": 554.962, | |
| "eval_steps_per_second": 34.685, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 1.0414731221701756, | |
| "grad_norm": 0.04575124382972717, | |
| "learning_rate": 0.00036361499999999997, | |
| "loss": 0.0076, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 1.0463626673446835, | |
| "grad_norm": 0.07697087526321411, | |
| "learning_rate": 0.0003616705555555555, | |
| "loss": 0.0076, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 1.0463626673446835, | |
| "eval_accuracy": 0.9992333809523809, | |
| "eval_loss": 0.005050502717494965, | |
| "eval_runtime": 53.4533, | |
| "eval_samples_per_second": 561.238, | |
| "eval_steps_per_second": 35.077, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 1.0512522125191914, | |
| "grad_norm": 0.05499347671866417, | |
| "learning_rate": 0.0003597261111111111, | |
| "loss": 0.0079, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 1.0561417576936993, | |
| "grad_norm": 0.035594772547483444, | |
| "learning_rate": 0.00035778166666666664, | |
| "loss": 0.0081, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 1.0561417576936993, | |
| "eval_accuracy": 0.9992301428571428, | |
| "eval_loss": 0.0050900341011583805, | |
| "eval_runtime": 53.2622, | |
| "eval_samples_per_second": 563.251, | |
| "eval_steps_per_second": 35.203, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 1.0610313028682072, | |
| "grad_norm": 0.020569855347275734, | |
| "learning_rate": 0.00035583722222222225, | |
| "loss": 0.0077, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 1.0659208480427151, | |
| "grad_norm": 0.06758717447519302, | |
| "learning_rate": 0.0003538927777777778, | |
| "loss": 0.0082, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 1.0659208480427151, | |
| "eval_accuracy": 0.9992373333333333, | |
| "eval_loss": 0.005076898727566004, | |
| "eval_runtime": 53.4707, | |
| "eval_samples_per_second": 561.054, | |
| "eval_steps_per_second": 35.066, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 1.070810393217223, | |
| "grad_norm": 0.04208175465464592, | |
| "learning_rate": 0.00035194833333333336, | |
| "loss": 0.0079, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 1.0756999383917307, | |
| "grad_norm": 0.040982868522405624, | |
| "learning_rate": 0.0003500038888888889, | |
| "loss": 0.0074, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 1.0756999383917307, | |
| "eval_accuracy": 0.9992489523809523, | |
| "eval_loss": 0.00500760693103075, | |
| "eval_runtime": 54.1302, | |
| "eval_samples_per_second": 554.219, | |
| "eval_steps_per_second": 34.639, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 1.0805894835662386, | |
| "grad_norm": 0.05090247467160225, | |
| "learning_rate": 0.0003480594444444444, | |
| "loss": 0.0075, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 1.0854790287407465, | |
| "grad_norm": 0.02564290165901184, | |
| "learning_rate": 0.000346115, | |
| "loss": 0.0077, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 1.0854790287407465, | |
| "eval_accuracy": 0.9992412380952381, | |
| "eval_loss": 0.005068215075880289, | |
| "eval_runtime": 53.2721, | |
| "eval_samples_per_second": 563.147, | |
| "eval_steps_per_second": 35.197, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 1.0903685739152544, | |
| "grad_norm": 0.032404959201812744, | |
| "learning_rate": 0.0003441705555555556, | |
| "loss": 0.0076, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 1.0952581190897623, | |
| "grad_norm": 0.05177515000104904, | |
| "learning_rate": 0.00034222611111111114, | |
| "loss": 0.0077, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 1.0952581190897623, | |
| "eval_accuracy": 0.9992587142857143, | |
| "eval_loss": 0.00494408467784524, | |
| "eval_runtime": 53.7598, | |
| "eval_samples_per_second": 558.038, | |
| "eval_steps_per_second": 34.877, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 1.10014766426427, | |
| "grad_norm": 0.041296541690826416, | |
| "learning_rate": 0.00034028166666666664, | |
| "loss": 0.0076, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 1.105037209438778, | |
| "grad_norm": 0.027352752164006233, | |
| "learning_rate": 0.0003383372222222222, | |
| "loss": 0.0077, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 1.105037209438778, | |
| "eval_accuracy": 0.9992613333333333, | |
| "eval_loss": 0.004911018069833517, | |
| "eval_runtime": 53.361, | |
| "eval_samples_per_second": 562.209, | |
| "eval_steps_per_second": 35.138, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 1.1099267546132858, | |
| "grad_norm": 0.017891952767968178, | |
| "learning_rate": 0.00033639277777777776, | |
| "loss": 0.0074, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 1.1148162997877937, | |
| "grad_norm": 0.10825661569833755, | |
| "learning_rate": 0.0003344483333333333, | |
| "loss": 0.0077, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 1.1148162997877937, | |
| "eval_accuracy": 0.9992698095238095, | |
| "eval_loss": 0.004937721882015467, | |
| "eval_runtime": 53.9545, | |
| "eval_samples_per_second": 556.024, | |
| "eval_steps_per_second": 34.752, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 1.1197058449623016, | |
| "grad_norm": 0.0252179317176342, | |
| "learning_rate": 0.00033250388888888887, | |
| "loss": 0.0072, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 1.1245953901368095, | |
| "grad_norm": 0.10007605701684952, | |
| "learning_rate": 0.0003305594444444445, | |
| "loss": 0.0073, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 1.1245953901368095, | |
| "eval_accuracy": 0.9992664285714286, | |
| "eval_loss": 0.005000779405236244, | |
| "eval_runtime": 53.4444, | |
| "eval_samples_per_second": 561.331, | |
| "eval_steps_per_second": 35.083, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 1.1294849353113174, | |
| "grad_norm": 0.08812825381755829, | |
| "learning_rate": 0.000328615, | |
| "loss": 0.0076, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 1.1343744804858251, | |
| "grad_norm": 0.04212397709488869, | |
| "learning_rate": 0.00032667055555555554, | |
| "loss": 0.0071, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 1.1343744804858251, | |
| "eval_accuracy": 0.9992689523809524, | |
| "eval_loss": 0.0048895059153437614, | |
| "eval_runtime": 56.3714, | |
| "eval_samples_per_second": 532.185, | |
| "eval_steps_per_second": 33.262, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 1.139264025660333, | |
| "grad_norm": 0.02763226442039013, | |
| "learning_rate": 0.0003247261111111111, | |
| "loss": 0.0075, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 1.144153570834841, | |
| "grad_norm": 0.05487339198589325, | |
| "learning_rate": 0.00032278166666666665, | |
| "loss": 0.0074, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 1.144153570834841, | |
| "eval_accuracy": 0.9992661428571429, | |
| "eval_loss": 0.004837568383663893, | |
| "eval_runtime": 54.3925, | |
| "eval_samples_per_second": 551.547, | |
| "eval_steps_per_second": 34.472, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 1.1490431160093488, | |
| "grad_norm": 0.04747488722205162, | |
| "learning_rate": 0.0003208372222222222, | |
| "loss": 0.0075, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 1.1539326611838567, | |
| "grad_norm": 0.10006921738386154, | |
| "learning_rate": 0.00031889277777777777, | |
| "loss": 0.0074, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 1.1539326611838567, | |
| "eval_accuracy": 0.9992860476190476, | |
| "eval_loss": 0.0047850459814071655, | |
| "eval_runtime": 53.7241, | |
| "eval_samples_per_second": 558.408, | |
| "eval_steps_per_second": 34.901, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 1.1588222063583646, | |
| "grad_norm": 0.03712115064263344, | |
| "learning_rate": 0.0003169483333333333, | |
| "loss": 0.0075, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 1.1637117515328723, | |
| "grad_norm": 0.05919933691620827, | |
| "learning_rate": 0.0003150038888888889, | |
| "loss": 0.0073, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 1.1637117515328723, | |
| "eval_accuracy": 0.9992771428571429, | |
| "eval_loss": 0.004803878720849752, | |
| "eval_runtime": 53.7517, | |
| "eval_samples_per_second": 558.121, | |
| "eval_steps_per_second": 34.883, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 1.1686012967073802, | |
| "grad_norm": 0.017905965447425842, | |
| "learning_rate": 0.00031305944444444444, | |
| "loss": 0.0069, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 1.1734908418818881, | |
| "grad_norm": 0.05728234723210335, | |
| "learning_rate": 0.000311115, | |
| "loss": 0.007, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 1.1734908418818881, | |
| "eval_accuracy": 0.999289, | |
| "eval_loss": 0.004755858797580004, | |
| "eval_runtime": 53.6273, | |
| "eval_samples_per_second": 559.417, | |
| "eval_steps_per_second": 34.964, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 1.178380387056396, | |
| "grad_norm": 0.05677701532840729, | |
| "learning_rate": 0.00030917055555555555, | |
| "loss": 0.007, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 1.183269932230904, | |
| "grad_norm": 0.05953844264149666, | |
| "learning_rate": 0.0003072261111111111, | |
| "loss": 0.0071, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 1.183269932230904, | |
| "eval_accuracy": 0.999293619047619, | |
| "eval_loss": 0.004746082704514265, | |
| "eval_runtime": 55.1206, | |
| "eval_samples_per_second": 544.262, | |
| "eval_steps_per_second": 34.016, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 1.1881594774054118, | |
| "grad_norm": 0.03433966636657715, | |
| "learning_rate": 0.00030528166666666666, | |
| "loss": 0.0071, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 1.1930490225799195, | |
| "grad_norm": 0.0718400701880455, | |
| "learning_rate": 0.0003033372222222222, | |
| "loss": 0.0073, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 1.1930490225799195, | |
| "eval_accuracy": 0.9992973333333334, | |
| "eval_loss": 0.004623962566256523, | |
| "eval_runtime": 54.751, | |
| "eval_samples_per_second": 547.935, | |
| "eval_steps_per_second": 34.246, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 1.1979385677544274, | |
| "grad_norm": 0.026871928945183754, | |
| "learning_rate": 0.0003013927777777778, | |
| "loss": 0.0065, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 1.2028281129289353, | |
| "grad_norm": 0.015808627009391785, | |
| "learning_rate": 0.00029944833333333333, | |
| "loss": 0.0069, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 1.2028281129289353, | |
| "eval_accuracy": 0.9992959047619048, | |
| "eval_loss": 0.004734317306429148, | |
| "eval_runtime": 53.9604, | |
| "eval_samples_per_second": 555.963, | |
| "eval_steps_per_second": 34.748, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 1.2077176581034432, | |
| "grad_norm": 0.06739887595176697, | |
| "learning_rate": 0.0002975038888888889, | |
| "loss": 0.0071, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 1.2126072032779511, | |
| "grad_norm": 0.020941952243447304, | |
| "learning_rate": 0.00029555944444444444, | |
| "loss": 0.007, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 1.2126072032779511, | |
| "eval_accuracy": 0.9992935238095239, | |
| "eval_loss": 0.004609288647770882, | |
| "eval_runtime": 54.1194, | |
| "eval_samples_per_second": 554.33, | |
| "eval_steps_per_second": 34.646, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 1.217496748452459, | |
| "grad_norm": 0.027827920392155647, | |
| "learning_rate": 0.000293615, | |
| "loss": 0.007, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 1.222386293626967, | |
| "grad_norm": 0.08693556487560272, | |
| "learning_rate": 0.00029167055555555556, | |
| "loss": 0.0069, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 1.222386293626967, | |
| "eval_accuracy": 0.9993093333333334, | |
| "eval_loss": 0.004602524451911449, | |
| "eval_runtime": 53.3938, | |
| "eval_samples_per_second": 561.863, | |
| "eval_steps_per_second": 35.116, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 1.2272758388014746, | |
| "grad_norm": 0.04795575141906738, | |
| "learning_rate": 0.0002897261111111111, | |
| "loss": 0.0069, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 1.2321653839759825, | |
| "grad_norm": 0.07266402244567871, | |
| "learning_rate": 0.00028778166666666667, | |
| "loss": 0.0071, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 1.2321653839759825, | |
| "eval_accuracy": 0.9993089523809524, | |
| "eval_loss": 0.00456634908914566, | |
| "eval_runtime": 54.0494, | |
| "eval_samples_per_second": 555.048, | |
| "eval_steps_per_second": 34.69, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 1.2370549291504904, | |
| "grad_norm": 0.03289886936545372, | |
| "learning_rate": 0.0002858372222222222, | |
| "loss": 0.0072, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 1.2419444743249983, | |
| "grad_norm": 0.02240580879151821, | |
| "learning_rate": 0.0002838927777777778, | |
| "loss": 0.007, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 1.2419444743249983, | |
| "eval_accuracy": 0.9993215714285715, | |
| "eval_loss": 0.004485046491026878, | |
| "eval_runtime": 53.4392, | |
| "eval_samples_per_second": 561.386, | |
| "eval_steps_per_second": 35.087, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 1.2468340194995062, | |
| "grad_norm": 0.040360696613788605, | |
| "learning_rate": 0.00028194833333333334, | |
| "loss": 0.0068, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 1.251723564674014, | |
| "grad_norm": 0.032697584480047226, | |
| "learning_rate": 0.0002800038888888889, | |
| "loss": 0.0072, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 1.251723564674014, | |
| "eval_accuracy": 0.9993274761904762, | |
| "eval_loss": 0.004469048231840134, | |
| "eval_runtime": 53.9627, | |
| "eval_samples_per_second": 555.939, | |
| "eval_steps_per_second": 34.746, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 1.2566131098485218, | |
| "grad_norm": 0.021058347076177597, | |
| "learning_rate": 0.00027805944444444445, | |
| "loss": 0.0069, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 1.2615026550230297, | |
| "grad_norm": 0.036056675016880035, | |
| "learning_rate": 0.000276115, | |
| "loss": 0.0067, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 1.2615026550230297, | |
| "eval_accuracy": 0.9993329047619047, | |
| "eval_loss": 0.004397740587592125, | |
| "eval_runtime": 53.2747, | |
| "eval_samples_per_second": 563.12, | |
| "eval_steps_per_second": 35.195, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 1.2663922001975376, | |
| "grad_norm": 0.034787457436323166, | |
| "learning_rate": 0.0002741705555555555, | |
| "loss": 0.0066, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 1.2712817453720455, | |
| "grad_norm": 0.05359942466020584, | |
| "learning_rate": 0.0002722261111111111, | |
| "loss": 0.0065, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 1.2712817453720455, | |
| "eval_accuracy": 0.9993344761904762, | |
| "eval_loss": 0.004399556666612625, | |
| "eval_runtime": 53.9523, | |
| "eval_samples_per_second": 556.047, | |
| "eval_steps_per_second": 34.753, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 1.2761712905465534, | |
| "grad_norm": 0.02243073098361492, | |
| "learning_rate": 0.0002702816666666667, | |
| "loss": 0.0068, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 1.2810608357210613, | |
| "grad_norm": 0.049295682460069656, | |
| "learning_rate": 0.00026833722222222223, | |
| "loss": 0.0068, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 1.2810608357210613, | |
| "eval_accuracy": 0.9993318571428571, | |
| "eval_loss": 0.004440919030457735, | |
| "eval_runtime": 53.2304, | |
| "eval_samples_per_second": 563.587, | |
| "eval_steps_per_second": 35.224, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 1.285950380895569, | |
| "grad_norm": 0.021682027727365494, | |
| "learning_rate": 0.0002663927777777778, | |
| "loss": 0.0067, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 1.290839926070077, | |
| "grad_norm": 0.0382467582821846, | |
| "learning_rate": 0.00026444833333333335, | |
| "loss": 0.0067, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 1.290839926070077, | |
| "eval_accuracy": 0.9993491904761905, | |
| "eval_loss": 0.004402833059430122, | |
| "eval_runtime": 53.8618, | |
| "eval_samples_per_second": 556.981, | |
| "eval_steps_per_second": 34.811, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 1.2957294712445848, | |
| "grad_norm": 0.041405659168958664, | |
| "learning_rate": 0.00026250388888888885, | |
| "loss": 0.0068, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 1.3006190164190927, | |
| "grad_norm": 0.039939701557159424, | |
| "learning_rate": 0.00026055944444444446, | |
| "loss": 0.0064, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 1.3006190164190927, | |
| "eval_accuracy": 0.9993461904761904, | |
| "eval_loss": 0.004411030560731888, | |
| "eval_runtime": 52.9835, | |
| "eval_samples_per_second": 566.214, | |
| "eval_steps_per_second": 35.388, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 1.3055085615936006, | |
| "grad_norm": 0.07499232143163681, | |
| "learning_rate": 0.000258615, | |
| "loss": 0.0068, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 1.3103981067681083, | |
| "grad_norm": 0.03830355405807495, | |
| "learning_rate": 0.0002566705555555556, | |
| "loss": 0.0066, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 1.3103981067681083, | |
| "eval_accuracy": 0.9993475238095239, | |
| "eval_loss": 0.004307963885366917, | |
| "eval_runtime": 54.0847, | |
| "eval_samples_per_second": 554.685, | |
| "eval_steps_per_second": 34.668, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 1.3152876519426164, | |
| "grad_norm": 0.04341171681880951, | |
| "learning_rate": 0.00025472611111111113, | |
| "loss": 0.0064, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 1.3201771971171241, | |
| "grad_norm": 0.05085453763604164, | |
| "learning_rate": 0.00025278166666666663, | |
| "loss": 0.0066, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 1.3201771971171241, | |
| "eval_accuracy": 0.9993423809523809, | |
| "eval_loss": 0.004391905851662159, | |
| "eval_runtime": 53.489, | |
| "eval_samples_per_second": 560.863, | |
| "eval_steps_per_second": 35.054, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 1.325066742291632, | |
| "grad_norm": 0.05465886369347572, | |
| "learning_rate": 0.0002508372222222222, | |
| "loss": 0.0065, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 1.32995628746614, | |
| "grad_norm": 0.028779752552509308, | |
| "learning_rate": 0.00024889277777777774, | |
| "loss": 0.0065, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 1.32995628746614, | |
| "eval_accuracy": 0.9993518571428571, | |
| "eval_loss": 0.004291407763957977, | |
| "eval_runtime": 53.5568, | |
| "eval_samples_per_second": 560.153, | |
| "eval_steps_per_second": 35.01, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 1.3348458326406478, | |
| "grad_norm": 0.07813508808612823, | |
| "learning_rate": 0.00024694833333333336, | |
| "loss": 0.0069, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 1.3397353778151557, | |
| "grad_norm": 0.034233298152685165, | |
| "learning_rate": 0.0002450038888888889, | |
| "loss": 0.0064, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 1.3397353778151557, | |
| "eval_accuracy": 0.9993458095238095, | |
| "eval_loss": 0.004360624123364687, | |
| "eval_runtime": 52.8603, | |
| "eval_samples_per_second": 567.534, | |
| "eval_steps_per_second": 35.471, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 1.3446249229896634, | |
| "grad_norm": 0.08024276047945023, | |
| "learning_rate": 0.00024305944444444447, | |
| "loss": 0.0061, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 1.3495144681641713, | |
| "grad_norm": 0.05493255332112312, | |
| "learning_rate": 0.00024111499999999997, | |
| "loss": 0.0066, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 1.3495144681641713, | |
| "eval_accuracy": 0.9993639047619047, | |
| "eval_loss": 0.00431590573862195, | |
| "eval_runtime": 53.7077, | |
| "eval_samples_per_second": 558.579, | |
| "eval_steps_per_second": 34.911, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 1.3544040133386792, | |
| "grad_norm": 0.04275180399417877, | |
| "learning_rate": 0.00023917055555555555, | |
| "loss": 0.0062, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 1.3592935585131871, | |
| "grad_norm": 0.07628139853477478, | |
| "learning_rate": 0.0002372261111111111, | |
| "loss": 0.0065, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 1.3592935585131871, | |
| "eval_accuracy": 0.9993583809523809, | |
| "eval_loss": 0.0042925444431602955, | |
| "eval_runtime": 53.3087, | |
| "eval_samples_per_second": 562.76, | |
| "eval_steps_per_second": 35.173, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 1.364183103687695, | |
| "grad_norm": 0.018862802535295486, | |
| "learning_rate": 0.00023528166666666667, | |
| "loss": 0.0064, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 1.3690726488622027, | |
| "grad_norm": 0.059994716197252274, | |
| "learning_rate": 0.00023333722222222222, | |
| "loss": 0.0061, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 1.3690726488622027, | |
| "eval_accuracy": 0.9993745714285714, | |
| "eval_loss": 0.004216773435473442, | |
| "eval_runtime": 53.7427, | |
| "eval_samples_per_second": 558.215, | |
| "eval_steps_per_second": 34.888, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 1.3739621940367108, | |
| "grad_norm": 0.02738560363650322, | |
| "learning_rate": 0.00023139277777777775, | |
| "loss": 0.006, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 1.3788517392112185, | |
| "grad_norm": 0.16879647970199585, | |
| "learning_rate": 0.0002294483333333333, | |
| "loss": 0.0062, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 1.3788517392112185, | |
| "eval_accuracy": 0.9993692380952381, | |
| "eval_loss": 0.004215199965983629, | |
| "eval_runtime": 53.2674, | |
| "eval_samples_per_second": 563.197, | |
| "eval_steps_per_second": 35.2, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 1.3837412843857264, | |
| "grad_norm": 0.03396091237664223, | |
| "learning_rate": 0.0002275038888888889, | |
| "loss": 0.0062, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 1.3886308295602343, | |
| "grad_norm": 0.04174041002988815, | |
| "learning_rate": 0.00022555944444444445, | |
| "loss": 0.0063, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 1.3886308295602343, | |
| "eval_accuracy": 0.9993620476190476, | |
| "eval_loss": 0.00427864259108901, | |
| "eval_runtime": 54.516, | |
| "eval_samples_per_second": 550.297, | |
| "eval_steps_per_second": 34.394, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 1.3935203747347422, | |
| "grad_norm": 0.032653287053108215, | |
| "learning_rate": 0.000223615, | |
| "loss": 0.0062, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 1.3984099199092501, | |
| "grad_norm": 0.04273010045289993, | |
| "learning_rate": 0.00022167055555555556, | |
| "loss": 0.0061, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 1.3984099199092501, | |
| "eval_accuracy": 0.9993804761904762, | |
| "eval_loss": 0.0041556586511433125, | |
| "eval_runtime": 53.4491, | |
| "eval_samples_per_second": 561.282, | |
| "eval_steps_per_second": 35.08, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 1.4032994650837578, | |
| "grad_norm": 0.043946944177150726, | |
| "learning_rate": 0.0002197261111111111, | |
| "loss": 0.0059, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 1.4081890102582657, | |
| "grad_norm": 0.016042672097682953, | |
| "learning_rate": 0.00021778166666666665, | |
| "loss": 0.0062, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 1.4081890102582657, | |
| "eval_accuracy": 0.9993822857142857, | |
| "eval_loss": 0.004146920517086983, | |
| "eval_runtime": 53.2095, | |
| "eval_samples_per_second": 563.809, | |
| "eval_steps_per_second": 35.238, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 1.4130785554327736, | |
| "grad_norm": 0.04190443456172943, | |
| "learning_rate": 0.0002158372222222222, | |
| "loss": 0.006, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 1.4179681006072815, | |
| "grad_norm": 0.029104501008987427, | |
| "learning_rate": 0.0002138927777777778, | |
| "loss": 0.006, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 1.4179681006072815, | |
| "eval_accuracy": 0.9993911428571428, | |
| "eval_loss": 0.004062490537762642, | |
| "eval_runtime": 53.4832, | |
| "eval_samples_per_second": 560.923, | |
| "eval_steps_per_second": 35.058, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 1.4228576457817894, | |
| "grad_norm": 0.019995709881186485, | |
| "learning_rate": 0.00021194833333333335, | |
| "loss": 0.0058, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 1.4277471909562973, | |
| "grad_norm": 0.016850166022777557, | |
| "learning_rate": 0.0002100038888888889, | |
| "loss": 0.0062, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 1.4277471909562973, | |
| "eval_accuracy": 0.9993850476190477, | |
| "eval_loss": 0.00406758114695549, | |
| "eval_runtime": 54.1149, | |
| "eval_samples_per_second": 554.376, | |
| "eval_steps_per_second": 34.648, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 1.4326367361308052, | |
| "grad_norm": 0.042491696774959564, | |
| "learning_rate": 0.00020805944444444443, | |
| "loss": 0.0059, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 1.437526281305313, | |
| "grad_norm": 0.07708732038736343, | |
| "learning_rate": 0.000206115, | |
| "loss": 0.006, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 1.437526281305313, | |
| "eval_accuracy": 0.9993972857142858, | |
| "eval_loss": 0.004030513111501932, | |
| "eval_runtime": 53.221, | |
| "eval_samples_per_second": 563.687, | |
| "eval_steps_per_second": 35.23, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 1.4424158264798208, | |
| "grad_norm": 0.032772552222013474, | |
| "learning_rate": 0.00020417055555555554, | |
| "loss": 0.0059, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 1.4473053716543287, | |
| "grad_norm": 0.041167329996824265, | |
| "learning_rate": 0.00020222611111111113, | |
| "loss": 0.0058, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 1.4473053716543287, | |
| "eval_accuracy": 0.999398, | |
| "eval_loss": 0.004109182395040989, | |
| "eval_runtime": 53.8747, | |
| "eval_samples_per_second": 556.848, | |
| "eval_steps_per_second": 34.803, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 1.4521949168288366, | |
| "grad_norm": 0.033146705478429794, | |
| "learning_rate": 0.00020028166666666668, | |
| "loss": 0.0058, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 1.4570844620033445, | |
| "grad_norm": 0.04614367336034775, | |
| "learning_rate": 0.0001983372222222222, | |
| "loss": 0.0057, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 1.4570844620033445, | |
| "eval_accuracy": 0.9994065238095238, | |
| "eval_loss": 0.003991841338574886, | |
| "eval_runtime": 53.7363, | |
| "eval_samples_per_second": 558.282, | |
| "eval_steps_per_second": 34.893, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 1.4619740071778522, | |
| "grad_norm": 0.031296566128730774, | |
| "learning_rate": 0.00019639277777777777, | |
| "loss": 0.0057, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 1.46686355235236, | |
| "grad_norm": 0.03523857146501541, | |
| "learning_rate": 0.00019444833333333333, | |
| "loss": 0.0059, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 1.46686355235236, | |
| "eval_accuracy": 0.9994045238095238, | |
| "eval_loss": 0.00398767227306962, | |
| "eval_runtime": 54.0668, | |
| "eval_samples_per_second": 554.869, | |
| "eval_steps_per_second": 34.679, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 1.471753097526868, | |
| "grad_norm": 0.030513431876897812, | |
| "learning_rate": 0.00019250388888888888, | |
| "loss": 0.006, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 1.476642642701376, | |
| "grad_norm": 0.03433874994516373, | |
| "learning_rate": 0.00019055944444444444, | |
| "loss": 0.0057, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 1.476642642701376, | |
| "eval_accuracy": 0.999412, | |
| "eval_loss": 0.003936768043786287, | |
| "eval_runtime": 53.4197, | |
| "eval_samples_per_second": 561.591, | |
| "eval_steps_per_second": 35.099, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 1.4815321878758838, | |
| "grad_norm": 0.03743559867143631, | |
| "learning_rate": 0.00018861500000000002, | |
| "loss": 0.0059, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 1.4864217330503917, | |
| "grad_norm": 0.023772869259119034, | |
| "learning_rate": 0.00018667055555555553, | |
| "loss": 0.0056, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 1.4864217330503917, | |
| "eval_accuracy": 0.9994103333333333, | |
| "eval_loss": 0.00395695585757494, | |
| "eval_runtime": 53.4862, | |
| "eval_samples_per_second": 560.892, | |
| "eval_steps_per_second": 35.056, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 1.4913112782248996, | |
| "grad_norm": 0.021286042407155037, | |
| "learning_rate": 0.0001847261111111111, | |
| "loss": 0.0056, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 1.4962008233994073, | |
| "grad_norm": 0.04487517103552818, | |
| "learning_rate": 0.00018278166666666667, | |
| "loss": 0.0059, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 1.4962008233994073, | |
| "eval_accuracy": 0.9994135714285715, | |
| "eval_loss": 0.0038883944507688284, | |
| "eval_runtime": 53.7959, | |
| "eval_samples_per_second": 557.663, | |
| "eval_steps_per_second": 34.854, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 1.5010903685739152, | |
| "grad_norm": 0.02229585126042366, | |
| "learning_rate": 0.00018083722222222222, | |
| "loss": 0.0056, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 1.505979913748423, | |
| "grad_norm": 0.06015641614794731, | |
| "learning_rate": 0.00017889277777777778, | |
| "loss": 0.0055, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 1.505979913748423, | |
| "eval_accuracy": 0.9994171428571429, | |
| "eval_loss": 0.0039031950291246176, | |
| "eval_runtime": 53.8206, | |
| "eval_samples_per_second": 557.408, | |
| "eval_steps_per_second": 34.838, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 1.510869458922931, | |
| "grad_norm": 0.060777414590120316, | |
| "learning_rate": 0.00017694833333333336, | |
| "loss": 0.0057, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 1.515759004097439, | |
| "grad_norm": 0.010729908011853695, | |
| "learning_rate": 0.00017500388888888886, | |
| "loss": 0.0055, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 1.515759004097439, | |
| "eval_accuracy": 0.9994168095238095, | |
| "eval_loss": 0.0038592983037233353, | |
| "eval_runtime": 52.9997, | |
| "eval_samples_per_second": 566.041, | |
| "eval_steps_per_second": 35.378, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 1.5206485492719466, | |
| "grad_norm": 0.07996519654989243, | |
| "learning_rate": 0.00017305944444444445, | |
| "loss": 0.0056, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 1.5255380944464547, | |
| "grad_norm": 0.05094398185610771, | |
| "learning_rate": 0.000171115, | |
| "loss": 0.0056, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 1.5255380944464547, | |
| "eval_accuracy": 0.9994315238095238, | |
| "eval_loss": 0.0037978454492986202, | |
| "eval_runtime": 53.5723, | |
| "eval_samples_per_second": 559.991, | |
| "eval_steps_per_second": 34.999, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 1.5304276396209624, | |
| "grad_norm": 0.038200926035642624, | |
| "learning_rate": 0.00016917055555555556, | |
| "loss": 0.0055, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 1.5353171847954703, | |
| "grad_norm": 0.10346455127000809, | |
| "learning_rate": 0.00016722611111111112, | |
| "loss": 0.0054, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 1.5353171847954703, | |
| "eval_accuracy": 0.9994299523809523, | |
| "eval_loss": 0.0037865168415009975, | |
| "eval_runtime": 53.2357, | |
| "eval_samples_per_second": 563.531, | |
| "eval_steps_per_second": 35.221, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 1.5402067299699782, | |
| "grad_norm": 0.015595887787640095, | |
| "learning_rate": 0.00016528166666666667, | |
| "loss": 0.0056, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 1.545096275144486, | |
| "grad_norm": 0.0232669860124588, | |
| "learning_rate": 0.00016333722222222223, | |
| "loss": 0.0055, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 1.545096275144486, | |
| "eval_accuracy": 0.9994310476190477, | |
| "eval_loss": 0.003748950082808733, | |
| "eval_runtime": 54.3134, | |
| "eval_samples_per_second": 552.35, | |
| "eval_steps_per_second": 34.522, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 1.549985820318994, | |
| "grad_norm": 0.04196183383464813, | |
| "learning_rate": 0.00016139277777777776, | |
| "loss": 0.0054, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 1.5548753654935017, | |
| "grad_norm": 0.04280064254999161, | |
| "learning_rate": 0.00015944833333333334, | |
| "loss": 0.0055, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 1.5548753654935017, | |
| "eval_accuracy": 0.9994327619047619, | |
| "eval_loss": 0.00377083383500576, | |
| "eval_runtime": 53.1652, | |
| "eval_samples_per_second": 564.278, | |
| "eval_steps_per_second": 35.267, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 1.5597649106680098, | |
| "grad_norm": 0.01646304689347744, | |
| "learning_rate": 0.00015750388888888887, | |
| "loss": 0.0053, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 1.5646544558425175, | |
| "grad_norm": 0.015490056946873665, | |
| "learning_rate": 0.00015555944444444443, | |
| "loss": 0.0053, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 1.5646544558425175, | |
| "eval_accuracy": 0.9994344285714286, | |
| "eval_loss": 0.0037254535127431154, | |
| "eval_runtime": 55.4441, | |
| "eval_samples_per_second": 541.086, | |
| "eval_steps_per_second": 33.818, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 1.5695440010170254, | |
| "grad_norm": 0.034573186188936234, | |
| "learning_rate": 0.000153615, | |
| "loss": 0.0052, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 1.5744335461915333, | |
| "grad_norm": 0.0471004843711853, | |
| "learning_rate": 0.00015167055555555554, | |
| "loss": 0.0055, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 1.5744335461915333, | |
| "eval_accuracy": 0.9994374285714286, | |
| "eval_loss": 0.003749826457351446, | |
| "eval_runtime": 52.9976, | |
| "eval_samples_per_second": 566.063, | |
| "eval_steps_per_second": 35.379, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 1.579323091366041, | |
| "grad_norm": 0.06533846259117126, | |
| "learning_rate": 0.0001497261111111111, | |
| "loss": 0.0056, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 1.5842126365405491, | |
| "grad_norm": 0.009449661709368229, | |
| "learning_rate": 0.00014778166666666668, | |
| "loss": 0.0053, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 1.5842126365405491, | |
| "eval_accuracy": 0.9994476666666666, | |
| "eval_loss": 0.003748701885342598, | |
| "eval_runtime": 53.6491, | |
| "eval_samples_per_second": 559.189, | |
| "eval_steps_per_second": 34.949, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 1.5891021817150568, | |
| "grad_norm": 0.009880056604743004, | |
| "learning_rate": 0.0001458372222222222, | |
| "loss": 0.0055, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 1.5939917268895647, | |
| "grad_norm": 0.05580669641494751, | |
| "learning_rate": 0.00014389277777777777, | |
| "loss": 0.0051, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 1.5939917268895647, | |
| "eval_accuracy": 0.9994498571428572, | |
| "eval_loss": 0.0037052214611321688, | |
| "eval_runtime": 53.1475, | |
| "eval_samples_per_second": 564.467, | |
| "eval_steps_per_second": 35.279, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 1.5988812720640726, | |
| "grad_norm": 0.033147793263196945, | |
| "learning_rate": 0.00014194833333333335, | |
| "loss": 0.0055, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 1.6037708172385805, | |
| "grad_norm": 0.04852864146232605, | |
| "learning_rate": 0.00014000388888888888, | |
| "loss": 0.0054, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 1.6037708172385805, | |
| "eval_accuracy": 0.9994494761904762, | |
| "eval_loss": 0.003642507828772068, | |
| "eval_runtime": 53.3294, | |
| "eval_samples_per_second": 562.542, | |
| "eval_steps_per_second": 35.159, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 1.6086603624130884, | |
| "grad_norm": 0.04461289569735527, | |
| "learning_rate": 0.00013805944444444444, | |
| "loss": 0.0053, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 1.613549907587596, | |
| "grad_norm": 0.04816494509577751, | |
| "learning_rate": 0.000136115, | |
| "loss": 0.0053, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 1.613549907587596, | |
| "eval_accuracy": 0.999451380952381, | |
| "eval_loss": 0.003629567800089717, | |
| "eval_runtime": 53.0304, | |
| "eval_samples_per_second": 565.713, | |
| "eval_steps_per_second": 35.357, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 1.6184394527621042, | |
| "grad_norm": 0.04067426174879074, | |
| "learning_rate": 0.00013417055555555555, | |
| "loss": 0.0063, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 1.623328997936612, | |
| "grad_norm": 0.040210772305727005, | |
| "learning_rate": 0.0001322261111111111, | |
| "loss": 0.0053, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 1.623328997936612, | |
| "eval_accuracy": 0.9994541904761904, | |
| "eval_loss": 0.0036138601135462523, | |
| "eval_runtime": 53.4406, | |
| "eval_samples_per_second": 561.371, | |
| "eval_steps_per_second": 35.086, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 1.6282185431111198, | |
| "grad_norm": 0.04125046357512474, | |
| "learning_rate": 0.00013028166666666666, | |
| "loss": 0.0053, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 1.6331080882856277, | |
| "grad_norm": 0.03415411710739136, | |
| "learning_rate": 0.00012833722222222222, | |
| "loss": 0.0051, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 1.6331080882856277, | |
| "eval_accuracy": 0.9994632380952381, | |
| "eval_loss": 0.003615338122472167, | |
| "eval_runtime": 53.1392, | |
| "eval_samples_per_second": 564.555, | |
| "eval_steps_per_second": 35.285, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 1.6379976334601354, | |
| "grad_norm": 0.03695495426654816, | |
| "learning_rate": 0.00012639277777777778, | |
| "loss": 0.0053, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 1.6428871786346435, | |
| "grad_norm": 0.011762870475649834, | |
| "learning_rate": 0.00012444833333333333, | |
| "loss": 0.0051, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 1.6428871786346435, | |
| "eval_accuracy": 0.9994638095238095, | |
| "eval_loss": 0.003587596118450165, | |
| "eval_runtime": 53.5347, | |
| "eval_samples_per_second": 560.384, | |
| "eval_steps_per_second": 35.024, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 1.6477767238091512, | |
| "grad_norm": 0.01232131477445364, | |
| "learning_rate": 0.0001225038888888889, | |
| "loss": 0.0048, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 1.652666268983659, | |
| "grad_norm": 0.04049614071846008, | |
| "learning_rate": 0.00012055944444444445, | |
| "loss": 0.0048, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 1.652666268983659, | |
| "eval_accuracy": 0.9994665714285714, | |
| "eval_loss": 0.003581820521503687, | |
| "eval_runtime": 53.0217, | |
| "eval_samples_per_second": 565.806, | |
| "eval_steps_per_second": 35.363, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 1.657555814158167, | |
| "grad_norm": 0.04034195467829704, | |
| "learning_rate": 0.00011861499999999999, | |
| "loss": 0.0051, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 1.662445359332675, | |
| "grad_norm": 0.014481657184660435, | |
| "learning_rate": 0.00011667055555555556, | |
| "loss": 0.0051, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 1.662445359332675, | |
| "eval_accuracy": 0.9994720476190476, | |
| "eval_loss": 0.0035638269037008286, | |
| "eval_runtime": 54.3974, | |
| "eval_samples_per_second": 551.497, | |
| "eval_steps_per_second": 34.469, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 1.6673349045071828, | |
| "grad_norm": 0.025204768404364586, | |
| "learning_rate": 0.00011472611111111111, | |
| "loss": 0.0051, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 1.6722244496816905, | |
| "grad_norm": 0.027605898678302765, | |
| "learning_rate": 0.00011278166666666666, | |
| "loss": 0.0049, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 1.6722244496816905, | |
| "eval_accuracy": 0.9994744285714285, | |
| "eval_loss": 0.003567066974937916, | |
| "eval_runtime": 53.8985, | |
| "eval_samples_per_second": 556.602, | |
| "eval_steps_per_second": 34.788, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 1.6771139948561986, | |
| "grad_norm": 0.038017790764570236, | |
| "learning_rate": 0.00011083722222222223, | |
| "loss": 0.005, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 1.6820035400307063, | |
| "grad_norm": 0.048752035945653915, | |
| "learning_rate": 0.00010889277777777778, | |
| "loss": 0.005, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 1.6820035400307063, | |
| "eval_accuracy": 0.9994751428571429, | |
| "eval_loss": 0.003484962275251746, | |
| "eval_runtime": 54.461, | |
| "eval_samples_per_second": 550.853, | |
| "eval_steps_per_second": 34.428, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 1.6868930852052142, | |
| "grad_norm": 0.08453824371099472, | |
| "learning_rate": 0.00010694833333333333, | |
| "loss": 0.005, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 1.691782630379722, | |
| "grad_norm": 0.01620589755475521, | |
| "learning_rate": 0.00010500388888888888, | |
| "loss": 0.005, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 1.691782630379722, | |
| "eval_accuracy": 0.9994759047619047, | |
| "eval_loss": 0.003478883532807231, | |
| "eval_runtime": 54.0084, | |
| "eval_samples_per_second": 555.469, | |
| "eval_steps_per_second": 34.717, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 1.69667217555423, | |
| "grad_norm": 0.024735888466238976, | |
| "learning_rate": 0.00010305944444444445, | |
| "loss": 0.005, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 1.701561720728738, | |
| "grad_norm": 0.020829100161790848, | |
| "learning_rate": 0.000101115, | |
| "loss": 0.005, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 1.701561720728738, | |
| "eval_accuracy": 0.9994835714285715, | |
| "eval_loss": 0.003460401203483343, | |
| "eval_runtime": 53.8886, | |
| "eval_samples_per_second": 556.704, | |
| "eval_steps_per_second": 34.794, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 1.7064512659032456, | |
| "grad_norm": 0.02870938368141651, | |
| "learning_rate": 9.917055555555555e-05, | |
| "loss": 0.0049, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 1.7113408110777537, | |
| "grad_norm": 0.03082539327442646, | |
| "learning_rate": 9.72261111111111e-05, | |
| "loss": 0.0049, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 1.7113408110777537, | |
| "eval_accuracy": 0.9994848095238095, | |
| "eval_loss": 0.0034341050777584314, | |
| "eval_runtime": 53.6965, | |
| "eval_samples_per_second": 558.695, | |
| "eval_steps_per_second": 34.918, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 1.7162303562522614, | |
| "grad_norm": 0.04300360381603241, | |
| "learning_rate": 9.528166666666667e-05, | |
| "loss": 0.0047, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 1.7211199014267693, | |
| "grad_norm": 0.010836569592356682, | |
| "learning_rate": 9.333722222222222e-05, | |
| "loss": 0.0049, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 1.7211199014267693, | |
| "eval_accuracy": 0.999487, | |
| "eval_loss": 0.0034149654675275087, | |
| "eval_runtime": 54.2439, | |
| "eval_samples_per_second": 553.058, | |
| "eval_steps_per_second": 34.566, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 1.7260094466012772, | |
| "grad_norm": 0.012880703434348106, | |
| "learning_rate": 9.139277777777777e-05, | |
| "loss": 0.0049, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 1.7308989917757849, | |
| "grad_norm": 0.029965711757540703, | |
| "learning_rate": 8.944833333333334e-05, | |
| "loss": 0.0049, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 1.7308989917757849, | |
| "eval_accuracy": 0.9994862857142857, | |
| "eval_loss": 0.0034615020267665386, | |
| "eval_runtime": 53.1772, | |
| "eval_samples_per_second": 564.151, | |
| "eval_steps_per_second": 35.259, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 1.735788536950293, | |
| "grad_norm": 0.014986414462327957, | |
| "learning_rate": 8.750388888888889e-05, | |
| "loss": 0.0048, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 1.7406780821248007, | |
| "grad_norm": 0.02675153873860836, | |
| "learning_rate": 8.555944444444445e-05, | |
| "loss": 0.0049, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 1.7406780821248007, | |
| "eval_accuracy": 0.9994909047619047, | |
| "eval_loss": 0.003412367310374975, | |
| "eval_runtime": 54.1342, | |
| "eval_samples_per_second": 554.178, | |
| "eval_steps_per_second": 34.636, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 1.7455676272993086, | |
| "grad_norm": 0.031100204214453697, | |
| "learning_rate": 8.3615e-05, | |
| "loss": 0.0051, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 1.7504571724738165, | |
| "grad_norm": 0.04925690218806267, | |
| "learning_rate": 8.167055555555555e-05, | |
| "loss": 0.005, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 1.7504571724738165, | |
| "eval_accuracy": 0.9994981428571429, | |
| "eval_loss": 0.003331870539113879, | |
| "eval_runtime": 53.5829, | |
| "eval_samples_per_second": 559.881, | |
| "eval_steps_per_second": 34.993, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 1.7553467176483244, | |
| "grad_norm": 0.029799846932291985, | |
| "learning_rate": 7.972611111111112e-05, | |
| "loss": 0.0048, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 1.7602362628228323, | |
| "grad_norm": 0.012169072404503822, | |
| "learning_rate": 7.778166666666666e-05, | |
| "loss": 0.005, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 1.7602362628228323, | |
| "eval_accuracy": 0.9994982380952381, | |
| "eval_loss": 0.003362874034792185, | |
| "eval_runtime": 53.5982, | |
| "eval_samples_per_second": 559.72, | |
| "eval_steps_per_second": 34.983, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 1.76512580799734, | |
| "grad_norm": 0.016585633158683777, | |
| "learning_rate": 7.583722222222222e-05, | |
| "loss": 0.0045, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 1.770015353171848, | |
| "grad_norm": 0.025369074195623398, | |
| "learning_rate": 7.389277777777777e-05, | |
| "loss": 0.0047, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 1.770015353171848, | |
| "eval_accuracy": 0.9995001904761904, | |
| "eval_loss": 0.0033530080690979958, | |
| "eval_runtime": 53.4193, | |
| "eval_samples_per_second": 561.595, | |
| "eval_steps_per_second": 35.1, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 1.7749048983463558, | |
| "grad_norm": 0.04421771690249443, | |
| "learning_rate": 7.194833333333333e-05, | |
| "loss": 0.0046, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 1.7797944435208637, | |
| "grad_norm": 0.05346609279513359, | |
| "learning_rate": 7.000388888888889e-05, | |
| "loss": 0.0048, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 1.7797944435208637, | |
| "eval_accuracy": 0.9995021428571429, | |
| "eval_loss": 0.0033386677969247103, | |
| "eval_runtime": 53.8476, | |
| "eval_samples_per_second": 557.128, | |
| "eval_steps_per_second": 34.82, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 1.7846839886953716, | |
| "grad_norm": 0.019687172025442123, | |
| "learning_rate": 6.805944444444444e-05, | |
| "loss": 0.0048, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 1.7895735338698793, | |
| "grad_norm": 0.026194104924798012, | |
| "learning_rate": 6.6115e-05, | |
| "loss": 0.0048, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 1.7895735338698793, | |
| "eval_accuracy": 0.9995039523809524, | |
| "eval_loss": 0.003320470917969942, | |
| "eval_runtime": 54.9547, | |
| "eval_samples_per_second": 545.904, | |
| "eval_steps_per_second": 34.119, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 1.7944630790443874, | |
| "grad_norm": 0.039239440113306046, | |
| "learning_rate": 6.417055555555556e-05, | |
| "loss": 0.0046, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 1.799352624218895, | |
| "grad_norm": 0.007467139046639204, | |
| "learning_rate": 6.222611111111111e-05, | |
| "loss": 0.0045, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 1.799352624218895, | |
| "eval_accuracy": 0.9995039047619048, | |
| "eval_loss": 0.003313555382192135, | |
| "eval_runtime": 54.3263, | |
| "eval_samples_per_second": 552.218, | |
| "eval_steps_per_second": 34.514, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 1.804242169393403, | |
| "grad_norm": 0.015036596916615963, | |
| "learning_rate": 6.028166666666666e-05, | |
| "loss": 0.0047, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 1.8091317145679109, | |
| "grad_norm": 0.03583378717303276, | |
| "learning_rate": 5.8337222222222226e-05, | |
| "loss": 0.0045, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 1.8091317145679109, | |
| "eval_accuracy": 0.9995053333333334, | |
| "eval_loss": 0.003319466719403863, | |
| "eval_runtime": 53.7318, | |
| "eval_samples_per_second": 558.329, | |
| "eval_steps_per_second": 34.896, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 1.8140212597424188, | |
| "grad_norm": 0.025585120543837547, | |
| "learning_rate": 5.6392777777777775e-05, | |
| "loss": 0.0046, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 1.8189108049169267, | |
| "grad_norm": 0.05633428320288658, | |
| "learning_rate": 5.444833333333333e-05, | |
| "loss": 0.0049, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 1.8189108049169267, | |
| "eval_accuracy": 0.9995094285714285, | |
| "eval_loss": 0.0032863873057067394, | |
| "eval_runtime": 54.2808, | |
| "eval_samples_per_second": 552.682, | |
| "eval_steps_per_second": 34.543, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 1.8238003500914344, | |
| "grad_norm": 0.08839651942253113, | |
| "learning_rate": 5.2503888888888895e-05, | |
| "loss": 0.0046, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 1.8286898952659425, | |
| "grad_norm": 0.02346086874604225, | |
| "learning_rate": 5.0559444444444445e-05, | |
| "loss": 0.0046, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 1.8286898952659425, | |
| "eval_accuracy": 0.999513, | |
| "eval_loss": 0.0032574611250311136, | |
| "eval_runtime": 53.1956, | |
| "eval_samples_per_second": 563.957, | |
| "eval_steps_per_second": 35.247, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 1.8335794404404502, | |
| "grad_norm": 0.04460394009947777, | |
| "learning_rate": 4.8615e-05, | |
| "loss": 0.0048, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 1.838468985614958, | |
| "grad_norm": 0.039988644421100616, | |
| "learning_rate": 4.667055555555555e-05, | |
| "loss": 0.0045, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 1.838468985614958, | |
| "eval_accuracy": 0.999518380952381, | |
| "eval_loss": 0.0032375219743698835, | |
| "eval_runtime": 54.4684, | |
| "eval_samples_per_second": 550.778, | |
| "eval_steps_per_second": 34.424, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 1.843358530789466, | |
| "grad_norm": 0.026043614372611046, | |
| "learning_rate": 4.4726111111111114e-05, | |
| "loss": 0.0045, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 1.8482480759639737, | |
| "grad_norm": 0.03250015527009964, | |
| "learning_rate": 4.2781666666666664e-05, | |
| "loss": 0.0046, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 1.8482480759639737, | |
| "eval_accuracy": 0.999518, | |
| "eval_loss": 0.003229686524719, | |
| "eval_runtime": 53.1577, | |
| "eval_samples_per_second": 564.358, | |
| "eval_steps_per_second": 35.272, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 1.8531376211384818, | |
| "grad_norm": 0.041333604604005814, | |
| "learning_rate": 4.083722222222222e-05, | |
| "loss": 0.0045, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 1.8580271663129895, | |
| "grad_norm": 0.030839432030916214, | |
| "learning_rate": 3.889277777777778e-05, | |
| "loss": 0.0044, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 1.8580271663129895, | |
| "eval_accuracy": 0.9995217142857142, | |
| "eval_loss": 0.0032240275759249926, | |
| "eval_runtime": 53.3872, | |
| "eval_samples_per_second": 561.933, | |
| "eval_steps_per_second": 35.121, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 1.8629167114874974, | |
| "grad_norm": 0.0212627574801445, | |
| "learning_rate": 3.694833333333333e-05, | |
| "loss": 0.0044, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 1.8678062566620053, | |
| "grad_norm": 0.04159221053123474, | |
| "learning_rate": 3.500388888888889e-05, | |
| "loss": 0.0046, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 1.8678062566620053, | |
| "eval_accuracy": 0.9995220476190476, | |
| "eval_loss": 0.0032232191879302263, | |
| "eval_runtime": 53.7977, | |
| "eval_samples_per_second": 557.645, | |
| "eval_steps_per_second": 34.853, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 1.8726958018365132, | |
| "grad_norm": 0.02389533445239067, | |
| "learning_rate": 3.3059444444444446e-05, | |
| "loss": 0.0045, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 1.877585347011021, | |
| "grad_norm": 0.02341424487531185, | |
| "learning_rate": 3.1115e-05, | |
| "loss": 0.0045, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 1.877585347011021, | |
| "eval_accuracy": 0.9995232857142857, | |
| "eval_loss": 0.0032045834232121706, | |
| "eval_runtime": 54.1632, | |
| "eval_samples_per_second": 553.881, | |
| "eval_steps_per_second": 34.618, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 1.8824748921855288, | |
| "grad_norm": 0.03770390897989273, | |
| "learning_rate": 2.9170555555555556e-05, | |
| "loss": 0.0046, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 1.887364437360037, | |
| "grad_norm": 0.024086985737085342, | |
| "learning_rate": 2.7226111111111112e-05, | |
| "loss": 0.0044, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 1.887364437360037, | |
| "eval_accuracy": 0.9995234285714286, | |
| "eval_loss": 0.003206311957910657, | |
| "eval_runtime": 53.6844, | |
| "eval_samples_per_second": 558.822, | |
| "eval_steps_per_second": 34.926, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 1.8922539825345446, | |
| "grad_norm": 0.02860225737094879, | |
| "learning_rate": 2.5281666666666665e-05, | |
| "loss": 0.0043, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 1.8971435277090525, | |
| "grad_norm": 0.034325193613767624, | |
| "learning_rate": 2.3337222222222222e-05, | |
| "loss": 0.0044, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 1.8971435277090525, | |
| "eval_accuracy": 0.9995254285714286, | |
| "eval_loss": 0.0031913991551846266, | |
| "eval_runtime": 54.2224, | |
| "eval_samples_per_second": 553.276, | |
| "eval_steps_per_second": 34.58, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 1.9020330728835604, | |
| "grad_norm": 0.03300917148590088, | |
| "learning_rate": 2.139277777777778e-05, | |
| "loss": 0.0045, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 1.9069226180580683, | |
| "grad_norm": 0.037190355360507965, | |
| "learning_rate": 1.9448333333333335e-05, | |
| "loss": 0.0043, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 1.9069226180580683, | |
| "eval_accuracy": 0.9995253333333334, | |
| "eval_loss": 0.0031883243937045336, | |
| "eval_runtime": 54.2098, | |
| "eval_samples_per_second": 553.405, | |
| "eval_steps_per_second": 34.588, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 1.9118121632325762, | |
| "grad_norm": 0.1029098629951477, | |
| "learning_rate": 1.7503888888888888e-05, | |
| "loss": 0.0045, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 1.9167017084070839, | |
| "grad_norm": 0.027764180675148964, | |
| "learning_rate": 1.5559444444444444e-05, | |
| "loss": 0.0043, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 1.9167017084070839, | |
| "eval_accuracy": 0.9995274761904762, | |
| "eval_loss": 0.0031636343337595463, | |
| "eval_runtime": 54.5719, | |
| "eval_samples_per_second": 549.733, | |
| "eval_steps_per_second": 34.358, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 1.921591253581592, | |
| "grad_norm": 0.031358424574136734, | |
| "learning_rate": 1.3615e-05, | |
| "loss": 0.0043, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 1.9264807987560997, | |
| "grad_norm": 0.035557158291339874, | |
| "learning_rate": 1.1670555555555556e-05, | |
| "loss": 0.0046, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 1.9264807987560997, | |
| "eval_accuracy": 0.999529761904762, | |
| "eval_loss": 0.0031551867723464966, | |
| "eval_runtime": 56.5808, | |
| "eval_samples_per_second": 530.215, | |
| "eval_steps_per_second": 33.138, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 1.9313703439306076, | |
| "grad_norm": 0.034682463854551315, | |
| "learning_rate": 9.72611111111111e-06, | |
| "loss": 0.0045, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 1.9362598891051155, | |
| "grad_norm": 0.023823970928788185, | |
| "learning_rate": 7.781666666666667e-06, | |
| "loss": 0.0043, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 1.9362598891051155, | |
| "eval_accuracy": 0.9995307142857143, | |
| "eval_loss": 0.0031563735101372004, | |
| "eval_runtime": 56.8893, | |
| "eval_samples_per_second": 527.34, | |
| "eval_steps_per_second": 32.959, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 1.9411494342796232, | |
| "grad_norm": 0.020929349586367607, | |
| "learning_rate": 5.837222222222222e-06, | |
| "loss": 0.0044, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 1.9460389794541313, | |
| "grad_norm": 0.025913028046488762, | |
| "learning_rate": 3.892777777777778e-06, | |
| "loss": 0.0044, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 1.9460389794541313, | |
| "eval_accuracy": 0.9995307619047619, | |
| "eval_loss": 0.003153804922476411, | |
| "eval_runtime": 54.8658, | |
| "eval_samples_per_second": 546.788, | |
| "eval_steps_per_second": 34.174, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 1.950928524628639, | |
| "grad_norm": 0.02582838013768196, | |
| "learning_rate": 1.9483333333333335e-06, | |
| "loss": 0.0044, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 1.9558180698031469, | |
| "grad_norm": 0.016758419573307037, | |
| "learning_rate": 3.888888888888889e-09, | |
| "loss": 0.0042, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 1.9558180698031469, | |
| "eval_accuracy": 0.9995309047619048, | |
| "eval_loss": 0.003156075021252036, | |
| "eval_runtime": 55.5903, | |
| "eval_samples_per_second": 539.663, | |
| "eval_steps_per_second": 33.729, | |
| "step": 200000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 200000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 7, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 1 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.480299103223808e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |