import gradio as gr
import pandas as pd

# Local modules
from data_loaders import (
    load_language_list,
    load_language_taxonomy,
    load_common_voice_data,
    load_app_content,
    get_common_voice_stats
)
from commercial_services import (
    fetch_azure_asr_languages,
    fetch_azure_tts_languages,
    fetch_google_stt_languages,
    fetch_google_tts_languages,
    fetch_aws_transcribe_languages,
    fetch_aws_polly_languages,
    get_azure_locales_for_language,
    get_google_locales_for_language,
    get_aws_locales_for_language,
    check_elevenlabs_multilingual_v2_support,
    check_elevenlabs_turbo_v3_support
)
from huggingface_search import (
    search_huggingface_models,
    search_huggingface_datasets,
    deduplicate_models
)
from language_metadata import get_language_metadata_html, get_default_metadata_html

# Configuration
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
COMMON_VOICE_DATA_FILE = "cv-corpus-24.0-2025-12-05.json"
COMMON_VOICE_VERSION = "24.0 (2025-12-05)"

# Language list will be loaded from CSV
# Structure: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}}
LANGUAGES = {}

# Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
# Structure: {language_name_lowercase: level}
LANGUAGE_TAXONOMY = {}

# Common Voice dataset
# Structure: {locale_code: {validHrs: float, totalHrs: float, splits: {gender: {...}}, ...}}
COMMON_VOICE_DATA = {}

# Taxonomy level descriptions
TAXONOMY_LEVELS = {
    0: "The Left-Behinds",
    1: "The Scraping-Bys",
    2: "The Hopefuls",
    3: "The Rising Stars",
    4: "The Underdogs",
    5: "The Winners"
}

# App content will be loaded from markdown file
APP_CONTENT = {
    "title": "Speech Resource Finder",
    "description": "Search for speech resources",
    "full_content": ""
}


def search_language_resources(language_code, deduplicate=False):
    """
    Search for ASR/TTS resources for a given language.
    Returns results organized by service type.

    deduplicate: if True, remove duplicate models (same base name) and
    keep only the one with the most downloads.
    """
    all_logs = []

    if not language_code:
        return None, None, None, None, 0, 0, None, None, 0, 0, ""

    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return None, None, None, None, 0, 0, None, None, 0, 0, ""

    language_name = lang_info['name']
    iso_639_1 = lang_info['iso_639_1']
    iso_639_2 = language_code  # language_code IS the ISO 639-2 code

    all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
    all_logs.append(f"Language codes: ISO 639-1={iso_639_1}, ISO 639-2={iso_639_2}")

    # Check Common Voice data
    all_logs.append("\n[Common Voice Dataset]")
    cv_stats = get_common_voice_stats(iso_639_2, iso_639_1, COMMON_VOICE_DATA)
    if cv_stats:
        all_logs.append(f"  ✅ Available in Common Voice (locale: {cv_stats['locale']})")
        all_logs.append(f"  Valid hours: {cv_stats['valid_hrs']:.1f}h, Total hours: {cv_stats['total_hrs']:.1f}h")
        all_logs.append(f"  Gender balance: {cv_stats['male_pct']:.1f}% male, {cv_stats['female_pct']:.1f}% female")
    else:
        all_logs.append("  ❌ Not available in Common Voice")

    # Fetch Azure data
    all_logs.append("\n[Azure Speech Services]")
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()
    all_logs.append(f"  Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")

    # Get matching Azure locales using ISO 639-1 code
    azure_locales = get_azure_locales_for_language(iso_639_1)
    all_logs.append(f"  Matching Azure locales: {azure_locales}")
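    # Illustrative note: for a language like French (ISO 639-1 "fr") the locale
    # helper is expected to return region-qualified codes such as
    # ["fr-FR", "fr-CA", ...]; support below is decided by intersecting that
    # list with each service's supported-locale map.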
    # Check Azure ASR support
    azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
    azure_asr_available = len(azure_asr_locales) > 0
    all_logs.append(f"  Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")

    # Check Azure TTS support and count voices
    azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
    azure_tts_available = len(azure_tts_locales) > 0
    azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
    all_logs.append(f"  Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")

    # Fetch Google Cloud data
    all_logs.append("\n[Google Cloud Speech]")
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()
    all_logs.append(f"  Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")

    # Get matching Google Cloud locales using ISO 639-1 code
    google_locales = get_google_locales_for_language(iso_639_1)
    all_logs.append(f"  Matching Google Cloud locales: {google_locales}")

    # Check Google Cloud STT support
    google_stt_locales = [loc for loc in google_locales if loc in google_stt]
    google_stt_available = len(google_stt_locales) > 0
    all_logs.append(f"  Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")

    # Check Google Cloud TTS support and count voices
    google_tts_locales = [loc for loc in google_locales if loc in google_tts]
    google_tts_available = len(google_tts_locales) > 0
    google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
    all_logs.append(f"  Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")

    # Fetch AWS data
    all_logs.append("\n[AWS (Transcribe + Polly)]")
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()
    all_logs.append(f"  Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")

    # Get matching AWS locales using ISO 639-1 code
    aws_locales = get_aws_locales_for_language(iso_639_1)
    all_logs.append(f"  Matching AWS locales: {aws_locales}")

    # Check AWS Transcribe support
    aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
    aws_transcribe_available = len(aws_transcribe_locales) > 0
    all_logs.append(f"  AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")

    # Check AWS Polly support and count voices
    aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
    aws_polly_available = len(aws_polly_locales) > 0
    aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
    all_logs.append(f"  AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")

    # Commercial Services
    commercial_rows = []

    # Azure Speech
    if azure_asr_available:
        azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)"
    else:
        azure_asr_text = "❌ N/A"
    if azure_tts_available:
        azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
    else:
        azure_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "Azure Speech",
        "ASR": azure_asr_text,
        "TTS": azure_tts_text,
    })
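    # Every provider row shares the {"Service", "ASR", "TTS"} shape so the rows
    # can be stacked into a single DataFrame below. A well-supported language
    # might render, for example, as:
    #   Azure Speech | ✅ 4 locale(s) | ✅ 4 locale(s), 12 voice(s)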
    # Google Cloud Speech
    if google_stt_available:
        google_stt_text = f"✅ {len(google_stt_locales)} locale(s)"
    else:
        google_stt_text = "❌ N/A"
    if google_tts_available:
        google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
    else:
        google_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "Google Cloud Speech",
        "ASR": google_stt_text,
        "TTS": google_tts_text,
    })

    # AWS (Transcribe + Polly)
    if aws_transcribe_available:
        aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)"
    else:
        aws_transcribe_text = "❌ N/A"
    if aws_polly_available:
        aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
    else:
        aws_polly_text = "❌ N/A"
    commercial_rows.append({
        "Service": "AWS (Transcribe + Polly)",
        "ASR": aws_transcribe_text,
        "TTS": aws_polly_text,
    })

    # ElevenLabs Multilingual v2 (TTS only)
    all_logs.append("\n[ElevenLabs]")
    elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(iso_639_1)
    all_logs.append(f"  Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}")
    if elevenlabs_v2_supported:
        elevenlabs_v2_tts_text = "✅ Supported"
    else:
        elevenlabs_v2_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "ElevenLabs Multilingual v2",
        "ASR": "N/A",  # ElevenLabs doesn't offer ASR
        "TTS": elevenlabs_v2_tts_text,
    })

    # ElevenLabs Turbo v3 (TTS only)
    elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(iso_639_2)
    all_logs.append(f"  Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}")
    if elevenlabs_v3_supported:
        elevenlabs_v3_tts_text = "✅ Supported"
    else:
        elevenlabs_v3_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "ElevenLabs Turbo v3",
        "ASR": "N/A",  # ElevenLabs doesn't offer ASR
        "TTS": elevenlabs_v3_tts_text,
    })

    commercial_df = pd.DataFrame(commercial_rows)

    # HuggingFace Models - search for real ASR and TTS models
    all_logs.append("\n[HuggingFace Models]")
    asr_models, asr_model_logs = search_huggingface_models(
        iso_639_1, iso_639_2, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f"  [ASR] {log}" for log in asr_model_logs])
    tts_models, tts_model_logs = search_huggingface_models(
        iso_639_1, iso_639_2, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f"  [TTS] {log}" for log in tts_model_logs])

    # Apply deduplication if requested
    if deduplicate:
        all_logs.append("\n[Deduplication]")
        asr_before = len(asr_models)
        asr_models = deduplicate_models(asr_models)
        all_logs.append(f"  ASR models: {asr_before} → {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")
        tts_before = len(tts_models)
        tts_models = deduplicate_models(tts_models)
        all_logs.append(f"  TTS models: {tts_before} → {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
    else:
        # Add a duplicates count of 1 for all models when not deduplicating
        for model in asr_models:
            model['duplicates'] = 1
        for model in tts_models:
            model['duplicates'] = 1

    # Format ASR models with clickable names
    asr_models_data = []
    for model in asr_models:
        asr_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1)
        })
    if asr_models_data:
        asr_models_df = pd.DataFrame(asr_models_data)
    else:
        # Empty dataframe if no models found
        asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])
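    # The "[name](url)" cells assume the results table renders markdown links
    # (with gr.Dataframe that is opt-in, e.g. datatype="markdown" for the
    # column); otherwise the raw markdown syntax is shown as plain text.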
    # Format TTS models with clickable names
    tts_models_data = []
    for model in tts_models:
        tts_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1)
        })
    if tts_models_data:
        tts_models_df = pd.DataFrame(tts_models_data)
    else:
        # Empty dataframe if no models found
        tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # HuggingFace Datasets - search for real ASR and TTS datasets
    all_logs.append("\n[HuggingFace Datasets]")
    asr_datasets, asr_dataset_logs = search_huggingface_datasets(
        iso_639_1, iso_639_2, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f"  [ASR] {log}" for log in asr_dataset_logs])
    tts_datasets, tts_dataset_logs = search_huggingface_datasets(
        iso_639_1, iso_639_2, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f"  [TTS] {log}" for log in tts_dataset_logs])

    # Format ASR datasets with clickable names
    asr_datasets_data = []
    for dataset in asr_datasets:
        asr_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', '')
        })
    if asr_datasets_data:
        asr_datasets_df = pd.DataFrame(asr_datasets_data)
    else:
        # Empty dataframe if no datasets found
        asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # Format TTS datasets with clickable names
    tts_datasets_data = []
    for dataset in tts_datasets:
        tts_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', '')
        })
    if tts_datasets_data:
        tts_datasets_df = pd.DataFrame(tts_datasets_data)
    else:
        # Empty dataframe if no datasets found
        tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # Combine all logs
    log_text = "\n".join(all_logs)

    # Return CV stats, commercial services, models, datasets, and logs
    return (cv_stats, commercial_df, asr_models_df, tts_models_df,
            len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df,
            len(asr_datasets), len(tts_datasets), log_text)


# Initialize - load language list and app content
print("Initializing Speech Resource Finder...")
APP_CONTENT = load_app_content(APP_CONTENT_FILE)
LANGUAGES = load_language_list(LANGUAGE_CODES_FILE)
LANGUAGE_TAXONOMY = load_language_taxonomy(LANGUAGE_TAXONOMY_URL)
COMMON_VOICE_DATA = load_common_voice_data(COMMON_VOICE_DATA_FILE)

# Create language choices for dropdown ("code: name" format for easy searching)
language_choices = [
    f"{code}: {info['name']}"
    for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])
]
print(f"Created dropdown with {len(language_choices)} language options")

with gr.Blocks(title=APP_CONTENT["title"]) as demo:
    gr.Markdown(f"# 🌐 {APP_CONTENT['title']}")
    gr.Markdown(APP_CONTENT["description"])

    with gr.Row(equal_height=True):
        with gr.Column(scale=70):
            language_dropdown = gr.Dropdown(
                choices=language_choices,
                label="Select Language",
                info="Type to search for a language",
                allow_custom_value=False,
                filterable=True,
            )
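        # Side panel (scale=30): compact metadata card for the selected
        # language, presumably filled via get_language_metadata_html /
        # get_default_metadata_html imported above.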
        with gr.Column(scale=30):
            # NOTE: only the placeholder texts survived in the recovered
            # source; the surrounding markup and the cv_stats_display
            # component name are reconstructions.
            language_metadata = gr.HTML(
                """<p>Select a language to see resource classification</p>"""
            )
            cv_stats_display = gr.Markdown(
                """Select a language"""
            )
    def format_cv_stats_md(cv_stats):
        """Render the Common Voice stats card for the side panel.

        NOTE: reconstructed from f-string fragments in the recovered source;
        the function name format_cv_stats_md is an assumption.
        """
        if cv_stats:
            return f"""
| Locale | {cv_stats['locale']} |
| Valid Hours | {cv_stats['valid_hrs']:.1f}h |
| Total Hours | {cv_stats['total_hrs']:.1f}h |
| Contributors | {cv_stats['users_formatted']} |
| Gender | {cv_stats['male_pct']:.0f}% M / {cv_stats['female_pct']:.0f}% F |
| Version | {COMMON_VOICE_VERSION} |
"""
        return "Not in Common Voice dataset"
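    # --- Hypothetical wiring sketch (assumption, not recovered source) ------
    # The recovered file ends above, so the event hookup is missing; something
    # like the following would connect the dropdown to
    # search_language_resources and the formatter. Component names other than
    # language_dropdown and cv_stats_display are illustrative.
    #
    #     def on_language_change(choice):
    #         code = choice.split(":")[0].strip() if choice else None
    #         (cv_stats, commercial_df, asr_models_df, tts_models_df,
    #          n_asr_models, n_tts_models, asr_datasets_df, tts_datasets_df,
    #          n_asr_datasets, n_tts_datasets, log_text) = search_language_resources(code)
    #         return format_cv_stats_md(cv_stats), commercial_df, log_text
    #
    #     language_dropdown.change(
    #         on_language_change,
    #         inputs=language_dropdown,
    #         outputs=[cv_stats_display, commercial_table, logs_box],
    #     )
    #
    # ...followed, at module level, by demo.launch().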