""" Data loading utilities for Speech Resource Finder """ import csv import json import requests def load_language_list(csv_path): """ Load ISO 639 language codes from CSV file Returns: dict: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}} """ languages = {} try: with open(csv_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: iso_639_2 = row['ISO 639-2'].strip() iso_639_1 = row['ISO 639-1'].strip() name = row['English name'].strip() french_name = row['French name'].strip() if iso_639_2 and name: languages[iso_639_2] = { "name": name, "iso_639_1": iso_639_1, "french_name": french_name, } print(f"Loaded {len(languages)} languages from {csv_path}") except Exception as e: print(f"ERROR: Failed to load language list from {csv_path}: {e}") print("The application cannot run without the language codes CSV file.") return languages def load_language_taxonomy(taxonomy_url): """ Load language taxonomy data from Microsoft's linguistic diversity project Returns: dict: {language_name_lowercase: level} """ taxonomy = {} try: response = requests.get(taxonomy_url, timeout=10) response.raise_for_status() # Parse the CSV-like content (format: language_name,level) for line in response.text.strip().split('\n'): if line.strip(): parts = line.strip().split(',') if len(parts) == 2: lang_name = parts[0].strip().lower() level = int(parts[1].strip()) taxonomy[lang_name] = level print(f"Loaded taxonomy data for {len(taxonomy)} languages") except Exception as e: print(f"Warning: Could not load language taxonomy: {e}") print("Language classification will show as 'Unknown'") return taxonomy def load_common_voice_data(json_path): """ Load Common Voice dataset statistics Returns: dict: {locale_code: {validHrs, totalHrs, splits, ...}} """ cv_data = {} try: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) cv_data = data.get('locales', {}) print(f"Loaded Common Voice data for {len(cv_data)} locales") except Exception as e: print(f"Warning: Could not load Common Voice data: {e}") print("Common Voice information will not be available") return cv_data def load_app_content(content_path): """ Load app content from markdown file Returns: dict: {"title": str, "description": str, "full_content": str} """ app_content = { "title": "Speech Resource Finder", "description": "Search for speech resources", "full_content": "" } try: with open(content_path, 'r', encoding='utf-8') as f: content = f.read() # Parse markdown content lines = content.split('\n') # Extract title (first # heading) title = "Speech Resource Finder" for line in lines: if line.startswith('# '): title = line[2:].strip() break # Extract description (text after ## Description until next ##) description = "" in_description = False for line in lines: if line.startswith('## Description'): in_description = True continue elif in_description and line.startswith('##'): break elif in_description and line.strip(): description += line.strip() + " " app_content = { "title": title, "description": description.strip(), "full_content": content } print(f"Loaded app content from {content_path}") except Exception as e: print(f"Error loading app content: {e}") print("Using default content") return app_content def get_common_voice_stats(language_code, iso_639_1, cv_data): """ Get Common Voice statistics for a language Args: language_code: ISO 639-2 (3-letter) code iso_639_1: ISO 639-1 (2-letter) code cv_data: Common Voice dataset dictionary Returns: dict or None: Statistics if found, None otherwise """ # Try to find CV data 
using different code formats cv_locale = None locale_data = None # 1. Try ISO 639-2 (3-letter) code directly (e.g., "zgh", "kab") if language_code and language_code in cv_data: cv_locale = language_code locale_data = cv_data[language_code] # 2. Try ISO 639-1 (2-letter) code (e.g., "en", "fr") elif iso_639_1 and iso_639_1 in cv_data: cv_locale = iso_639_1 locale_data = cv_data[iso_639_1] # 3. Try to find any locale that starts with the 2-letter code (e.g., "fy-NL", "ga-IE") elif iso_639_1: matching_locales = [loc for loc in cv_data.keys() if loc.startswith(iso_639_1 + '-')] if matching_locales: cv_locale = matching_locales[0] # Take the first match locale_data = cv_data[cv_locale] if not locale_data: return None # Extract statistics valid_hrs = locale_data.get('validHrs', 0) total_hrs = locale_data.get('totalHrs', 0) users = locale_data.get('users', 0) # Extract gender balance gender_splits = locale_data.get('splits', {}).get('gender', {}) male_pct = gender_splits.get('male_masculine', 0) * 100 female_pct = gender_splits.get('female_feminine', 0) * 100 # Format users count if users >= 1000: users_formatted = f"{users / 1000:.0f}k" else: users_formatted = str(users) return { 'locale': cv_locale, 'valid_hrs': valid_hrs, 'total_hrs': total_hrs, 'male_pct': male_pct, 'female_pct': female_pct, 'users': users, 'users_formatted': users_formatted }
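

# ---------------------------------------------------------------------------
# Usage sketch (not part of the module's public API): a minimal example of how
# these loaders might be wired together at application startup and how the
# Common Voice lookup is consumed. The file paths and taxonomy URL below are
# placeholders assumed for illustration, not resources shipped with the
# project -- substitute the real locations used by the application.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    LANGUAGE_CSV = "data/iso_639_codes.csv"                       # assumed path
    CV_STATS_JSON = "data/common_voice_stats.json"                # assumed path
    TAXONOMY_URL = "https://example.com/language_taxonomy.csv"    # assumed URL
    CONTENT_MD = "content/app_content.md"                         # assumed path

    # Each loader degrades gracefully (prints a warning and returns an empty
    # or default structure), so this sketch runs even if a resource is missing.
    languages = load_language_list(LANGUAGE_CSV)
    taxonomy = load_language_taxonomy(TAXONOMY_URL)
    cv_data = load_common_voice_data(CV_STATS_JSON)
    app_content = load_app_content(CONTENT_MD)

    # Report Common Voice coverage for each loaded language.
    for code, info in languages.items():
        stats = get_common_voice_stats(code, info["iso_639_1"], cv_data)
        if stats:
            print(f"{info['name']} ({stats['locale']}): "
                  f"{stats['valid_hrs']} validated hours, "
                  f"{stats['users_formatted']} contributors")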