dscripka ggerganov committed on
Commit
469988b
·
unverified ·
1 Parent(s): a46b62a

examples : added audio_ctx argument to main and server (#1857)

Browse files

* added audio_ctx argument to main and server examples

* Better default value

Co-authored-by: Georgi Gerganov <[email protected]>

* better default value (again)

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: Georgi Gerganov <[email protected]>

examples/main/main.cpp CHANGED
@@ -64,6 +64,7 @@ struct whisper_params {
64
  int32_t max_len = 0;
65
  int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
66
  int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
 
67
 
68
  float word_thold = 0.01f;
69
  float entropy_thold = 2.40f;
@@ -136,6 +137,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
136
  else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
137
  else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
138
  else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
 
139
  else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
140
  else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
141
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
@@ -195,6 +197,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
195
  fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
196
  fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
197
  fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
 
198
  fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
199
  fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
200
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
@@ -967,6 +970,7 @@ int main(int argc, char ** argv) {
967
  wparams.thold_pt = params.word_thold;
968
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
969
  wparams.split_on_word = params.split_on_word;
 
970
 
971
  wparams.speed_up = params.speed_up;
972
  wparams.debug_mode = params.debug_mode;
 
64
  int32_t max_len = 0;
65
  int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
66
  int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
67
+ int32_t audio_ctx = 0;
68
 
69
  float word_thold = 0.01f;
70
  float entropy_thold = 2.40f;
 
137
  else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
138
  else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
139
  else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
140
+ else if (arg == "-ac" || arg == "--audio-context") { params.audio_ctx = std::stoi(argv[++i]); }
141
  else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
142
  else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
143
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
 
197
  fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
198
  fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
199
  fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
200
+ fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
201
  fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
202
  fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
203
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
 
970
  wparams.thold_pt = params.word_thold;
971
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
972
  wparams.split_on_word = params.split_on_word;
973
+ wparams.audio_ctx = params.audio_ctx;
974
 
975
  wparams.speed_up = params.speed_up;
976
  wparams.debug_mode = params.debug_mode;
examples/server/server.cpp CHANGED
@@ -60,6 +60,7 @@ struct whisper_params {
60
  int32_t max_len = 0;
61
  int32_t best_of = 2;
62
  int32_t beam_size = -1;
 
63
 
64
  float word_thold = 0.01f;
65
  float entropy_thold = 2.40f;
@@ -138,6 +139,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
138
  fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
139
  fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
140
  fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
 
141
  fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
142
  fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
143
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
@@ -183,6 +185,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
183
  else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
184
  else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
185
  else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
 
186
  else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
187
  else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
188
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
@@ -433,6 +436,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
433
  {
434
  params.beam_size = std::stoi(req.get_file_value("beam_size").content);
435
  }
 
 
 
 
436
  if (req.has_file("word_thold"))
437
  {
438
  params.word_thold = std::stof(req.get_file_value("word_thold").content);
@@ -741,6 +748,7 @@ int main(int argc, char ** argv) {
741
  wparams.thold_pt = params.word_thold;
742
  wparams.max_len = params.max_len == 0 ? 60 : params.max_len;
743
  wparams.split_on_word = params.split_on_word;
 
744
 
745
  wparams.speed_up = params.speed_up;
746
  wparams.debug_mode = params.debug_mode;
 
60
  int32_t max_len = 0;
61
  int32_t best_of = 2;
62
  int32_t beam_size = -1;
63
+ int32_t audio_ctx = 0;
64
 
65
  float word_thold = 0.01f;
66
  float entropy_thold = 2.40f;
 
139
  fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
140
  fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
141
  fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
142
+ fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
143
  fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
144
  fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
145
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
 
185
  else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
186
  else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
187
  else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
188
+ else if (arg == "-ac" || arg == "--audio-context") { params.audio_ctx = std::stoi(argv[++i]); }
189
  else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
190
  else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
191
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
 
436
  {
437
  params.beam_size = std::stoi(req.get_file_value("beam_size").content);
438
  }
439
+ if (req.has_file("audio_ctx"))
440
+ {
441
+ params.audio_ctx = std::stof(req.get_file_value("audio_ctx").content);
442
+ }
443
  if (req.has_file("word_thold"))
444
  {
445
  params.word_thold = std::stof(req.get_file_value("word_thold").content);
 
748
  wparams.thold_pt = params.word_thold;
749
  wparams.max_len = params.max_len == 0 ? 60 : params.max_len;
750
  wparams.split_on_word = params.split_on_word;
751
+ wparams.audio_ctx = params.audio_ctx;
752
 
753
  wparams.speed_up = params.speed_up;
754
  wparams.debug_mode = params.debug_mode;