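"""Speech Resource Finder.

Gradio app that, for a selected language, reports Mozilla Common Voice
coverage, commercial ASR/TTS support (Azure, Google Cloud, AWS, ElevenLabs),
and matching HuggingFace models and datasets.
"""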
import gradio as gr
import pandas as pd
# Local modules
from data_loaders import (
load_language_list, load_language_taxonomy, load_common_voice_data,
load_app_content, get_common_voice_stats
)
from commercial_services import (
fetch_azure_asr_languages, fetch_azure_tts_languages,
fetch_google_stt_languages, fetch_google_tts_languages,
fetch_aws_transcribe_languages, fetch_aws_polly_languages,
get_azure_locales_for_language, get_google_locales_for_language,
get_aws_locales_for_language,
check_elevenlabs_multilingual_v2_support, check_elevenlabs_turbo_v3_support
)
from huggingface_search import (
search_huggingface_models, search_huggingface_datasets, deduplicate_models
)
from language_metadata import get_language_metadata_html, get_default_metadata_html
# Configuration
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
COMMON_VOICE_DATA_FILE = "cv-corpus-24.0-2025-12-05.json"
COMMON_VOICE_VERSION = "24.0 (2025-12-05)"
# Language list will be loaded from CSV
# Structure: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}}
LANGUAGES = {}
# Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
# Structure: {language_name_lowercase: level}
LANGUAGE_TAXONOMY = {}
# Common Voice dataset
# Structure: {locale_code: {validHrs: float, totalHrs: float, splits: {gender: {...}}, ...}}
COMMON_VOICE_DATA = {}
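# Illustrative entry (made-up numbers; shape as documented above):
#   COMMON_VOICE_DATA["tr"] = {"validHrs": 120.5, "totalHrs": 140.2,
#                              "splits": {"gender": {...}}, ...}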
# Taxonomy level descriptions
TAXONOMY_LEVELS = {
0: "The Left-Behinds",
1: "The Scraping-Bys",
2: "The Hopefuls",
3: "The Rising Stars",
4: "The Underdogs",
5: "The Winners"
}
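# Lookups are by lowercase language name (illustrative):
#   LANGUAGE_TAXONOMY.get("swahili")  # -> an int in 0..5, or None if unlisted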
# App content will be loaded from markdown file
APP_CONTENT = {
"title": "Speech Resource Finder",
"description": "Search for speech resources",
"full_content": ""
}
def search_language_resources(language_code, deduplicate=False):
"""
Search for ASR/TTS resources for a given language
Returns results organized by service type
deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads
"""
all_logs = []
if not language_code:
return None, None, None, None, 0, 0, None, None, 0, 0, ""
lang_info = LANGUAGES.get(language_code)
if not lang_info:
return None, None, None, None, 0, 0, None, None, 0, 0, ""
language_name = lang_info['name']
iso_639_1 = lang_info['iso_639_1']
iso_639_2 = language_code # language_code IS the ISO 639-2 code
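    # Illustrative example: for Turkish the CSV yields iso_639_2="tur" (the
    # dropdown key) and iso_639_1="tr", which drives locale matching such as
    # "tr-TR" in the service lookups below.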
all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
all_logs.append(f"Language codes: ISO 639-1={iso_639_1}, ISO 639-2={iso_639_2}")
# Check Common Voice data
all_logs.append("\n[Common Voice Dataset]")
cv_stats = get_common_voice_stats(iso_639_2, iso_639_1, COMMON_VOICE_DATA)
if cv_stats:
all_logs.append(f" βœ… Available in Common Voice (locale: {cv_stats['locale']})")
all_logs.append(f" Valid hours: {cv_stats['valid_hrs']:.1f}h, Total hours: {cv_stats['total_hrs']:.1f}h")
all_logs.append(f" Gender balance: {cv_stats['male_pct']:.1f}% male, {cv_stats['female_pct']:.1f}% female")
else:
all_logs.append(f" ❌ Not available in Common Voice")
# Fetch Azure data
all_logs.append("\n[Azure Speech Services]")
azure_asr = fetch_azure_asr_languages()
azure_tts = fetch_azure_tts_languages()
all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")
# Get matching Azure locales using ISO 639-1 code
azure_locales = get_azure_locales_for_language(iso_639_1)
all_logs.append(f" Matching Azure locales: {azure_locales}")
# Check Azure ASR support
azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
azure_asr_available = len(azure_asr_locales) > 0
all_logs.append(f" Azure ASR: {'βœ… Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")
# Check Azure TTS support and count voices
azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
azure_tts_available = len(azure_tts_locales) > 0
azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
all_logs.append(f" Azure TTS: {'βœ… Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")
# Fetch Google Cloud data
all_logs.append("\n[Google Cloud Speech]")
google_stt = fetch_google_stt_languages()
google_tts = fetch_google_tts_languages()
all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")
# Get matching Google Cloud locales using ISO 639-1 code
google_locales = get_google_locales_for_language(iso_639_1)
all_logs.append(f" Matching Google Cloud locales: {google_locales}")
# Check Google Cloud STT support
google_stt_locales = [loc for loc in google_locales if loc in google_stt]
google_stt_available = len(google_stt_locales) > 0
all_logs.append(f" Google STT: {'βœ… Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")
# Check Google Cloud TTS support and count voices
google_tts_locales = [loc for loc in google_locales if loc in google_tts]
google_tts_available = len(google_tts_locales) > 0
google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
all_logs.append(f" Google TTS: {'βœ… Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")
# Fetch AWS data
all_logs.append("\n[AWS (Transcribe + Polly)]")
aws_transcribe = fetch_aws_transcribe_languages()
aws_polly = fetch_aws_polly_languages()
all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")
# Get matching AWS locales using ISO 639-1 code
aws_locales = get_aws_locales_for_language(iso_639_1)
all_logs.append(f" Matching AWS locales: {aws_locales}")
# Check AWS Transcribe support
aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
aws_transcribe_available = len(aws_transcribe_locales) > 0
all_logs.append(f" AWS Transcribe: {'βœ… Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")
# Check AWS Polly support and count voices
aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
aws_polly_available = len(aws_polly_locales) > 0
aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
all_logs.append(f" AWS Polly: {'βœ… Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")
# Commercial Services
commercial_rows = []
# Azure Speech
if azure_asr_available:
azure_asr_text = f"βœ… {len(azure_asr_locales)} locale(s)"
else:
azure_asr_text = "❌ N/A"
if azure_tts_available:
azure_tts_text = f"βœ… {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
else:
azure_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "Azure Speech",
"ASR": azure_asr_text,
"TTS": azure_tts_text,
})
# Google Cloud Speech
if google_stt_available:
google_stt_text = f"βœ… {len(google_stt_locales)} locale(s)"
else:
google_stt_text = "❌ N/A"
if google_tts_available:
google_tts_text = f"βœ… {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
else:
google_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "Google Cloud Speech",
"ASR": google_stt_text,
"TTS": google_tts_text,
})
# AWS (Transcribe + Polly)
if aws_transcribe_available:
aws_transcribe_text = f"βœ… {len(aws_transcribe_locales)} locale(s)"
else:
aws_transcribe_text = "❌ N/A"
if aws_polly_available:
aws_polly_text = f"βœ… {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
else:
aws_polly_text = "❌ N/A"
commercial_rows.append({
"Service": "AWS (Transcribe + Polly)",
"ASR": aws_transcribe_text,
"TTS": aws_polly_text,
})
# ElevenLabs Multilingual v2 (TTS only)
all_logs.append("\n[ElevenLabs]")
elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(iso_639_1)
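    # Note: the v2 check keys on the two-letter ISO 639-1 code, while the
    # Turbo v3 check below keys on ISO 639-2, presumably matching how each
    # model's supported-language list is published.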
all_logs.append(f" Multilingual v2: {'βœ… Supported' if elevenlabs_v2_supported else '❌ Not supported'}")
if elevenlabs_v2_supported:
elevenlabs_v2_tts_text = "βœ… Supported"
else:
elevenlabs_v2_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "ElevenLabs Multilingual v2",
"ASR": "N/A", # ElevenLabs doesn't offer ASR
"TTS": elevenlabs_v2_tts_text,
})
# ElevenLabs Turbo v3 (TTS only)
elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(iso_639_2)
all_logs.append(f" Turbo v3: {'βœ… Supported' if elevenlabs_v3_supported else '❌ Not supported'}")
if elevenlabs_v3_supported:
elevenlabs_v3_tts_text = "βœ… Supported"
else:
elevenlabs_v3_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "ElevenLabs Turbo v3",
"ASR": "N/A", # ElevenLabs doesn't offer ASR
"TTS": elevenlabs_v3_tts_text,
})
commercial_df = pd.DataFrame(commercial_rows)
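    # commercial_df now has one row per service, e.g. (illustrative values):
    #   Service         ASR              TTS
    #   Azure Speech    βœ… 2 locale(s)   βœ… 2 locale(s), 8 voice(s)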
# HuggingFace Models - Search for real ASR and TTS models
all_logs.append("\n[HuggingFace Models]")
asr_models, asr_model_logs = search_huggingface_models(iso_639_1, iso_639_2, 'automatic-speech-recognition', max_results=100, max_pages=5)
all_logs.extend([f" [ASR] {log}" for log in asr_model_logs])
tts_models, tts_model_logs = search_huggingface_models(iso_639_1, iso_639_2, 'text-to-speech', max_results=100, max_pages=5)
all_logs.extend([f" [TTS] {log}" for log in tts_model_logs])
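    # search_huggingface_models / search_huggingface_datasets (defined in
    # huggingface_search.py) are assumed to query the Hub with both ISO codes
    # as language tags, paginating up to max_pages and capping at max_results.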
# Apply deduplication if requested
if deduplicate:
all_logs.append(f"\n[Deduplication]")
asr_before = len(asr_models)
asr_models = deduplicate_models(asr_models)
all_logs.append(f" ASR models: {asr_before} β†’ {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")
tts_before = len(tts_models)
tts_models = deduplicate_models(tts_models)
all_logs.append(f" TTS models: {tts_before} β†’ {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
else:
# Add duplicates count of 1 for all models when not deduplicating
for model in asr_models:
model['duplicates'] = 1
for model in tts_models:
model['duplicates'] = 1
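    # Either way, each model dict now carries a 'duplicates' count: 1 when
    # deduplication is off; otherwise deduplicate_models is expected to set it
    # to the number of variants collapsed into the surviving model.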
# Format ASR models with clickable names
asr_models_data = []
for model in asr_models:
asr_models_data.append({
"Model Name": f"[{model['name']}]({model['url']})",
"Downloads": model['downloads'],
"Likes": model['likes'],
"Size": model.get('size', ''),
"Duplicates": model.get('duplicates', 1)
})
if asr_models_data:
asr_models_df = pd.DataFrame(asr_models_data)
else:
# Empty dataframe if no models found
asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])
# Format TTS models with clickable names
tts_models_data = []
for model in tts_models:
tts_models_data.append({
"Model Name": f"[{model['name']}]({model['url']})",
"Downloads": model['downloads'],
"Likes": model['likes'],
"Size": model.get('size', ''),
"Duplicates": model.get('duplicates', 1)
})
if tts_models_data:
tts_models_df = pd.DataFrame(tts_models_data)
else:
# Empty dataframe if no models found
tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])
# HuggingFace Datasets - Search for real ASR and TTS datasets
all_logs.append("\n[HuggingFace Datasets]")
asr_datasets, asr_dataset_logs = search_huggingface_datasets(iso_639_1, iso_639_2, 'automatic-speech-recognition', max_results=100, max_pages=5)
all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs])
tts_datasets, tts_dataset_logs = search_huggingface_datasets(iso_639_1, iso_639_2, 'text-to-speech', max_results=100, max_pages=5)
all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs])
# Format ASR datasets with clickable names
asr_datasets_data = []
for dataset in asr_datasets:
asr_datasets_data.append({
"Dataset Name": f"[{dataset['name']}]({dataset['url']})",
"Downloads": dataset['downloads'],
"Likes": dataset['likes'],
"Size": dataset.get('size', '')
})
if asr_datasets_data:
asr_datasets_df = pd.DataFrame(asr_datasets_data)
else:
# Empty dataframe if no datasets found
asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])
# Format TTS datasets with clickable names
tts_datasets_data = []
for dataset in tts_datasets:
tts_datasets_data.append({
"Dataset Name": f"[{dataset['name']}]({dataset['url']})",
"Downloads": dataset['downloads'],
"Likes": dataset['likes'],
"Size": dataset.get('size', '')
})
if tts_datasets_data:
tts_datasets_df = pd.DataFrame(tts_datasets_data)
else:
# Empty dataframe if no datasets found
tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])
# Combine all logs
log_text = "\n".join(all_logs)
# Return CV stats, commercial services, models, datasets, and logs
return cv_stats, commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text
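# Illustrative usage (not executed here):
#   (cv, services, asr_df, tts_df, n_asr, n_tts,
#    asr_ds, tts_ds, n_asr_ds, n_tts_ds, logs) = \
#       search_language_resources("tur", deduplicate=True)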
# Initialize - load language list and app content
print("Initializing Speech Resource Finder...")
APP_CONTENT = load_app_content(APP_CONTENT_FILE)
LANGUAGES = load_language_list(LANGUAGE_CODES_FILE)
LANGUAGE_TAXONOMY = load_language_taxonomy(LANGUAGE_TAXONOMY_URL)
COMMON_VOICE_DATA = load_common_voice_data(COMMON_VOICE_DATA_FILE)
# Create language choices for dropdown (code: name format for easy searching)
language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
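# Each choice looks like "tur: Turkish" (illustrative); on_search() below
# recovers the code by splitting on the first ":".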
print(f"Created dropdown with {len(language_choices)} language options")
with gr.Blocks(title=APP_CONTENT["title"]) as demo:
gr.Markdown(f"# 🌐 {APP_CONTENT['title']}")
gr.Markdown(APP_CONTENT["description"])
with gr.Row(equal_height=True):
with gr.Column(scale=70):
language_dropdown = gr.Dropdown(
choices=language_choices,
label="Select Language",
info="Type to search for a language",
allow_custom_value=False,
filterable=True,
)
with gr.Column(scale=30):
language_metadata = gr.HTML(
"""<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
<p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
</div>""",
elem_id="language-metadata"
)
with gr.Row():
with gr.Column(scale=70):
gr.Markdown("## Commercial Services")
commercial_table = gr.Dataframe(
headers=["Service", "ASR", "TTS"],
interactive=False,
wrap=True,
)
with gr.Column(scale=30):
gr.Markdown("## Common Voice")
cv_info = gr.HTML(
"""<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
<p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
</div>""",
elem_id="cv-info"
)
gr.Markdown("## HuggingFace Models")
with gr.Row():
deduplicate_checkbox = gr.Checkbox(
label="Deduplicate models",
value=True,
info="Keep only the model with most downloads for each base name"
)
# Create tabs for ASR and TTS models with count labels
with gr.Tabs():
with gr.Tab(label="ASR Models") as asr_tab:
asr_count_label = gr.Markdown("*Loading...*")
asr_models_table = gr.Dataframe(
headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str", "number"],
)
with gr.Tab(label="TTS Models") as tts_tab:
tts_count_label = gr.Markdown("*Loading...*")
tts_models_table = gr.Dataframe(
headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str", "number"],
)
gr.Markdown("## HuggingFace Datasets")
# Create tabs for ASR and TTS datasets with count labels
with gr.Tabs():
with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
asr_datasets_count_label = gr.Markdown("*Loading...*")
asr_datasets_table = gr.Dataframe(
headers=["Dataset Name", "Downloads", "Likes", "Size"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str"],
)
with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
tts_datasets_count_label = gr.Markdown("*Loading...*")
tts_datasets_table = gr.Dataframe(
headers=["Dataset Name", "Downloads", "Likes", "Size"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str"],
)
with gr.Accordion("Logs", open=False):
log_textbox = gr.Textbox(
show_label=False,
lines=15,
max_lines=30,
interactive=False,
placeholder="Logs will appear here...",
autoscroll=True,
)
# About section with full content
with gr.Accordion("About this tool", open=False):
gr.Markdown(APP_CONTENT["full_content"])
def on_search(language_selection, deduplicate):
if not language_selection:
cv_default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
<p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
</div>"""
return get_default_metadata_html(), cv_default_html, None, "", None, "", None, "", None, "", None, ""
# Extract the language code from "code: name" format
language_code = language_selection.split(":")[0].strip()
# Get language name and ISO 639-1 code
language_name = LANGUAGES.get(language_code, {}).get("name", "")
iso_639_1 = LANGUAGES.get(language_code, {}).get("iso_639_1", "")
# Generate metadata HTML (taxonomy + Wikipedia info)
metadata_html = get_language_metadata_html(language_code, language_name, iso_639_1, LANGUAGE_TAXONOMY)
cv_stats, commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
# Create Common Voice info HTML
if cv_stats:
cv_info_html = f"""<div style='padding: 15px; border: 2px solid #4caf50; border-radius: 4px; background-color: #ffffff;'>
<div style='margin-bottom: 12px;'>
<span style='font-size: 18px;'>βœ…</span>
<span style='font-weight: bold; color: #2e7d32; font-size: 14px; margin-left: 4px;'>Available</span>
</div>
<table style='width: 100%; border-collapse: collapse; font-size: 13px;'>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666; width: 45%;'>Locale</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['locale']}</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Valid Hours</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['valid_hrs']:.1f}h</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Total Hours</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['total_hrs']:.1f}h</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Contributors</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['users_formatted']}</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Gender</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['male_pct']:.0f}% M / {cv_stats['female_pct']:.0f}% F</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Version</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{COMMON_VOICE_VERSION}</td>
</tr>
</table>
</div>"""
else:
cv_info_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
<div style='margin-bottom: 8px;'>
<span style='font-size: 18px;'>❌</span>
<span style='font-weight: bold; color: #666; font-size: 14px; margin-left: 4px;'>Not Available</span>
</div>
<p style='margin: 0; color: #999; font-size: 12px;'>Not in Common Voice dataset</p>
</div>"""
# Create count labels
asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
return metadata_html, cv_info_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
    # Re-run the search whenever the selected language or the deduplicate
    # checkbox changes; both events share the same inputs and outputs.
    search_outputs = [
        language_metadata, cv_info, commercial_table,
        asr_count_label, asr_models_table, tts_count_label, tts_models_table,
        asr_datasets_count_label, asr_datasets_table,
        tts_datasets_count_label, tts_datasets_table, log_textbox,
    ]
    for event in (language_dropdown.change, deduplicate_checkbox.change):
        event(
            fn=on_search,
            inputs=[language_dropdown, deduplicate_checkbox],
            outputs=search_outputs,
        )
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)