Spaces:
Running
Running
examples : added audio_ctx argument to main and server (#1857)
Browse files
* added audio_ctx argument to main and server examples
* Better default value
Co-authored-by: Georgi Gerganov <[email protected]>
* better default value (again)
Co-authored-by: Georgi Gerganov <[email protected]>
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- examples/main/main.cpp +4 -0
- examples/server/server.cpp +8 -0
examples/main/main.cpp
CHANGED
|
@@ -64,6 +64,7 @@ struct whisper_params {
|
|
| 64 |
int32_t max_len = 0;
|
| 65 |
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
|
| 66 |
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
|
|
|
|
| 67 |
|
| 68 |
float word_thold = 0.01f;
|
| 69 |
float entropy_thold = 2.40f;
|
|
@@ -136,6 +137,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 136 |
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
| 137 |
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
| 138 |
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
|
|
|
| 139 |
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 140 |
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 141 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
|
@@ -195,6 +197,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 195 |
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
| 196 |
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
| 197 |
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
|
|
|
| 198 |
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 199 |
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 200 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
@@ -967,6 +970,7 @@ int main(int argc, char ** argv) {
|
|
| 967 |
wparams.thold_pt = params.word_thold;
|
| 968 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 969 |
wparams.split_on_word = params.split_on_word;
|
|
|
|
| 970 |
|
| 971 |
wparams.speed_up = params.speed_up;
|
| 972 |
wparams.debug_mode = params.debug_mode;
|
|
|
|
| 64 |
int32_t max_len = 0;
|
| 65 |
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
|
| 66 |
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
|
| 67 |
+
int32_t audio_ctx = 0;
|
| 68 |
|
| 69 |
float word_thold = 0.01f;
|
| 70 |
float entropy_thold = 2.40f;
|
|
|
|
| 137 |
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
| 138 |
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
| 139 |
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
| 140 |
+
else if (arg == "-ac" || arg == "--audio-context") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 141 |
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 142 |
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 143 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
|
|
|
| 197 |
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
| 198 |
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
| 199 |
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
| 200 |
+
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 201 |
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 202 |
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 203 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
|
|
| 970 |
wparams.thold_pt = params.word_thold;
|
| 971 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 972 |
wparams.split_on_word = params.split_on_word;
|
| 973 |
+
wparams.audio_ctx = params.audio_ctx;
|
| 974 |
|
| 975 |
wparams.speed_up = params.speed_up;
|
| 976 |
wparams.debug_mode = params.debug_mode;
|
examples/server/server.cpp
CHANGED
|
@@ -60,6 +60,7 @@ struct whisper_params {
|
|
| 60 |
int32_t max_len = 0;
|
| 61 |
int32_t best_of = 2;
|
| 62 |
int32_t beam_size = -1;
|
|
|
|
| 63 |
|
| 64 |
float word_thold = 0.01f;
|
| 65 |
float entropy_thold = 2.40f;
|
|
@@ -138,6 +139,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 138 |
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
| 139 |
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
| 140 |
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
|
|
|
| 141 |
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 142 |
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 143 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
@@ -183,6 +185,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|
| 183 |
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
| 184 |
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
| 185 |
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
|
|
|
| 186 |
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 187 |
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 188 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
|
@@ -433,6 +436,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
|
| 433 |
{
|
| 434 |
params.beam_size = std::stoi(req.get_file_value("beam_size").content);
|
| 435 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
if (req.has_file("word_thold"))
|
| 437 |
{
|
| 438 |
params.word_thold = std::stof(req.get_file_value("word_thold").content);
|
|
@@ -741,6 +748,7 @@ int main(int argc, char ** argv) {
|
|
| 741 |
wparams.thold_pt = params.word_thold;
|
| 742 |
wparams.max_len = params.max_len == 0 ? 60 : params.max_len;
|
| 743 |
wparams.split_on_word = params.split_on_word;
|
|
|
|
| 744 |
|
| 745 |
wparams.speed_up = params.speed_up;
|
| 746 |
wparams.debug_mode = params.debug_mode;
|
|
|
|
| 60 |
int32_t max_len = 0;
|
| 61 |
int32_t best_of = 2;
|
| 62 |
int32_t beam_size = -1;
|
| 63 |
+
int32_t audio_ctx = 0;
|
| 64 |
|
| 65 |
float word_thold = 0.01f;
|
| 66 |
float entropy_thold = 2.40f;
|
|
|
|
| 139 |
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
| 140 |
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
| 141 |
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
| 142 |
+
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 143 |
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 144 |
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 145 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
|
|
| 185 |
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
| 186 |
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
| 187 |
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
| 188 |
+
else if (arg == "-ac" || arg == "--audio-context") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 189 |
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 190 |
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 191 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
|
|
|
| 436 |
{
|
| 437 |
params.beam_size = std::stoi(req.get_file_value("beam_size").content);
|
| 438 |
}
|
| 439 |
+
if (req.has_file("audio_ctx"))
|
| 440 |
+
{
|
| 441 |
+
params.audio_ctx = std::stof(req.get_file_value("audio_ctx").content);
|
| 442 |
+
}
|
| 443 |
if (req.has_file("word_thold"))
|
| 444 |
{
|
| 445 |
params.word_thold = std::stof(req.get_file_value("word_thold").content);
|
|
|
|
| 748 |
wparams.thold_pt = params.word_thold;
|
| 749 |
wparams.max_len = params.max_len == 0 ? 60 : params.max_len;
|
| 750 |
wparams.split_on_word = params.split_on_word;
|
| 751 |
+
wparams.audio_ctx = params.audio_ctx;
|
| 752 |
|
| 753 |
wparams.speed_up = params.speed_up;
|
| 754 |
wparams.debug_mode = params.debug_mode;
|