|
|
""" |
|
|
Commercial speech service support checkers |
|
|
Includes Azure, Google Cloud, AWS, and ElevenLabs |
|
|
""" |
|
|
|
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
from functools import lru_cache |
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
    """Scrape the Azure Speech-to-Text docs for supported locales.

    Returns a dict mapping BCP-47 locale (e.g. 'en-US') to its display
    language name, or an empty dict when the page cannot be fetched or
    parsed. Cached so the page is downloaded at most once per process.
    """
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        result = {}
        for table in soup.find_all('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                continue

            header_text = ' '.join(
                th.get_text(strip=True) for th in table_rows[0].find_all('th')
            )
            # Only the first table whose header mentions Locale/Language
            # is harvested; later tables are ignored.
            if 'Locale' not in header_text and 'Language' not in header_text:
                continue

            for data_row in table_rows[1:]:
                cells = data_row.find_all('td')
                if len(cells) < 2:
                    continue
                locale = cells[0].get_text(strip=True)
                language = cells[1].get_text(strip=True)
                if locale and language:
                    result[locale] = language
            break

        return result
    except Exception as e:
        print(f"Error fetching Azure ASR data: {e}")
        return {}
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
    """Scrape the Azure Text-to-Speech docs for supported locales.

    Returns a dict mapping BCP-47 locale to
    ``{'language': name, 'voice_count': number of neural voices}``,
    or an empty dict on failure. Cached so the page is downloaded at
    most once per process.
    """
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        result = {}
        for table in soup.find_all('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                continue

            header_text = ' '.join(
                th.get_text(strip=True) for th in table_rows[0].find_all('th')
            )
            # Only the first TTS-looking table is harvested.
            if ('Text to speech' not in header_text
                    and 'voices' not in header_text.lower()):
                continue

            for data_row in table_rows[1:]:
                cells = data_row.find_all('td')
                if len(cells) < 3:
                    continue
                locale = cells[0].get_text(strip=True)
                language = cells[1].get_text(strip=True)
                # The voices cell lists names like 'en-US-JennyNeural';
                # each occurrence of 'Neural' counts as one voice.
                neural_voices = cells[2].get_text(strip=True).count('Neural')
                if locale and language:
                    result[locale] = {
                        'language': language,
                        'voice_count': neural_voices
                    }
            break

        return result
    except Exception as e:
        print(f"Error fetching Azure TTS data: {e}")
        return {}
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_google_stt_languages():
    """Scrape Google Cloud Speech-to-Text supported languages.

    Returns a dict mapping BCP-47 locale (e.g. 'en-US') to the language
    name ('' when no Name column exists), or an empty dict on failure.
    Cached so the page is downloaded at most once per process.
    """
    url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        google_stt = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the locale and language-name columns by header text.
            bcp47_idx = None
            name_idx = None
            for idx, header in enumerate(headers):
                if 'BCP-47' in header or 'BCP47' in header:
                    bcp47_idx = idx
                if 'Name' in header and name_idx is None:
                    name_idx = idx

            if bcp47_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > bcp47_idx:
                        locale = cols[bcp47_idx].get_text(strip=True)
                        # BUG FIX: compare against None explicitly — the old
                        # truthiness test (`if name_idx and ...`) discarded
                        # the language name whenever the Name column was at
                        # index 0.
                        language = (
                            cols[name_idx].get_text(strip=True)
                            if name_idx is not None and len(cols) > name_idx
                            else ''
                        )
                        # Skip placeholder cells (em-dash/hyphen) and blanks.
                        if locale and locale not in ['—', '-', '']:
                            google_stt[locale] = language

        return google_stt
    except Exception as e:
        print(f"Error fetching Google STT data: {e}")
        return {}
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_google_tts_languages():
    """Scrape the Google Cloud Text-to-Speech voices page.

    Returns a dict mapping BCP-47 locale to
    ``{'language': name, 'voice_count': number of voice rows seen}``,
    or an empty dict on failure. Cached so the page is downloaded at
    most once per process.
    """
    url = "https://cloud.google.com/text-to-speech/docs/voices"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        result = {}
        for table in soup.find_all('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                continue

            header_cells = [th.get_text(strip=True)
                            for th in table_rows[0].find_all('th')]

            # Find the column that carries the language code.
            code_col = next(
                (i for i, h in enumerate(header_cells)
                 if 'Language code' in h or 'language code' in h.lower()),
                None,
            )
            if code_col is None:
                continue

            for data_row in table_rows[1:]:
                cells = data_row.find_all('td')
                if len(cells) <= code_col:
                    continue
                locale = cells[code_col].get_text(strip=True)
                if not locale or locale in ['—', '-', '']:
                    continue
                # Each table row describes one voice, so a locale seen
                # again just bumps its voice tally.
                if locale in result:
                    result[locale]['voice_count'] += 1
                else:
                    language = cells[0].get_text(strip=True) if len(cells) > 0 else ''
                    result[locale] = {
                        'language': language,
                        'voice_count': 1
                    }

        return result
    except Exception as e:
        print(f"Error fetching Google TTS data: {e}")
        return {}
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
    """Return ElevenLabs Multilingual v2 supported languages (ISO 639-1 codes).

    Returns a frozenset of lowercase codes (two-letter ISO 639-1, plus
    'fil' for Filipino). Frozen because ``lru_cache`` hands every caller
    the same object: a mutable set could be corrupted in place by one
    caller for all subsequent callers. Membership tests (the only
    documented use) are unaffected.
    """
    # Hard-coded from the ElevenLabs model documentation; no network call.
    return frozenset({
        'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it', 'es',
        'id', 'nl', 'tr', 'fil', 'pl', 'sv', 'bg', 'ro', 'ar', 'cs',
        'el', 'fi', 'hr', 'ms', 'sk', 'da', 'ta', 'uk', 'ru'
    })
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
    """Return ElevenLabs Eleven Turbo v3 supported languages (ISO 639-2/3 codes).

    Returns a frozenset of lowercase three-letter codes. Frozen because
    ``lru_cache`` hands every caller the same object: a mutable set could
    be corrupted in place by one caller for all subsequent callers.
    Membership tests (the only documented use) are unaffected.
    """
    # Hard-coded from the ElevenLabs model documentation; no network call.
    return frozenset({
        'afr', 'ara', 'hye', 'asm', 'aze', 'bel', 'ben', 'bos', 'bul', 'cat',
        'ceb', 'nya', 'hrv', 'ces', 'dan', 'nld', 'eng', 'est', 'fil', 'fin',
        'fra', 'glg', 'kat', 'deu', 'ell', 'guj', 'hau', 'heb', 'hin', 'hun',
        'isl', 'ind', 'gle', 'ita', 'jpn', 'jav', 'kan', 'kaz', 'kir', 'kor',
        'lav', 'lin', 'lit', 'ltz', 'mkd', 'msa', 'mal', 'cmn', 'mar', 'nep',
        'nor', 'pus', 'fas', 'pol', 'por', 'pan', 'ron', 'rus', 'srp', 'snd',
        'slk', 'slv', 'som', 'spa', 'swa', 'swe', 'tam', 'tel', 'tha', 'tur',
        'ukr', 'urd', 'vie', 'cym'
    })
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
    """Scrape AWS Transcribe (ASR) supported languages.

    Returns a dict mapping AWS locale code (e.g. 'en-US') to the language
    name ('' when no name column is found), or an empty dict on failure.
    Cached so the page is downloaded at most once per process.
    """
    url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        aws_transcribe = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the code and name columns by header text.
            # BUG FIX: the old logic let a 'Language code' header also match
            # the name test (it startswith 'Language'), clobbering
            # lang_name_idx with the code column; the code column is now
            # explicitly excluded and the first match wins.
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'language code' in header.lower():
                    if lang_code_idx is None:
                        lang_code_idx = idx
                elif header.startswith('Language') and lang_name_idx is None:
                    lang_name_idx = idx

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        # BUG FIX: compare against None — truthiness dropped
                        # the name whenever its column was at index 0.
                        language = (
                            cols[lang_name_idx].get_text(strip=True)
                            if lang_name_idx is not None and len(cols) > lang_name_idx
                            else ''
                        )
                        # Skip placeholder cells (em-dash/hyphen) and blanks.
                        if locale and locale not in ['—', '-', '']:
                            aws_transcribe[locale] = language

        return aws_transcribe
    except Exception as e:
        print(f"Error fetching AWS Transcribe data: {e}")
        return {}
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
    """Scrape AWS Polly (TTS) supported languages.

    Returns a dict mapping AWS locale code to
    ``{'language': name, 'voice_count': number of voice rows seen}``,
    or an empty dict on failure. Cached so the page is downloaded at
    most once per process.
    """
    url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        aws_polly = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the code and name columns by header text.
            # BUG FIX: the old logic let a 'Language code' header also match
            # the name test (it startswith 'Language'), clobbering
            # lang_name_idx with the code column; the code column is now
            # explicitly excluded and the first match wins.
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'language code' in header.lower():
                    if lang_code_idx is None:
                        lang_code_idx = idx
                elif header.startswith('Language') and lang_name_idx is None:
                    lang_name_idx = idx

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        # BUG FIX: compare against None — truthiness dropped
                        # the name whenever its column was at index 0.
                        language = (
                            cols[lang_name_idx].get_text(strip=True)
                            if lang_name_idx is not None and len(cols) > lang_name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            # Each row is one voice; repeated locales bump
                            # the tally instead of overwriting the entry.
                            if locale in aws_polly:
                                aws_polly[locale]['voice_count'] += 1
                            else:
                                aws_polly[locale] = {
                                    'language': language,
                                    'voice_count': 1
                                }

        return aws_polly
    except Exception as e:
        print(f"Error fetching AWS Polly data: {e}")
        return {}
|
|
|
|
|
|
|
|
def get_azure_locales_for_language(iso_639_1):
    """
    Get Azure BCP-47 locales for a language using ISO 639-1 (2-letter) code

    Returns a sorted list of matching locales drawn from both the Azure
    ASR and TTS tables; empty list for a falsy code.
    """
    if not iso_639_1:
        return []

    prefix = iso_639_1 + '-'
    # Union of every locale Azure knows about, from either service.
    candidates = set(fetch_azure_asr_languages()) | set(fetch_azure_tts_languages())

    return sorted(
        locale for locale in candidates
        if locale == iso_639_1 or locale.startswith(prefix)
    )
|
|
|
|
|
|
|
|
def get_google_locales_for_language(iso_639_1):
    """
    Get Google Cloud BCP-47 locales for a language using ISO 639-1 (2-letter) code

    Returns a sorted list of matching locales drawn from both the Google
    STT and TTS tables; empty list for a falsy code.
    """
    if not iso_639_1:
        return []

    prefix = iso_639_1 + '-'
    # Union of every locale Google Cloud knows about, from either service.
    candidates = set(fetch_google_stt_languages()) | set(fetch_google_tts_languages())

    return sorted(
        locale for locale in candidates
        if locale == iso_639_1 or locale.startswith(prefix)
    )
|
|
|
|
|
|
|
|
def check_elevenlabs_multilingual_v2_support(iso_639_1):
    """
    Check if ElevenLabs Multilingual v2 supports a language using ISO 639-1 code

    Returns True if supported, False otherwise (including for a falsy code)
    """
    if not iso_639_1:
        return False
    return iso_639_1 in fetch_elevenlabs_multilingual_v2()
|
|
|
|
|
|
|
|
def check_elevenlabs_turbo_v3_support(iso_639_2):
    """
    Check if ElevenLabs Turbo v3 supports a language using ISO 639-2 code

    Returns True if supported, False otherwise (including for a falsy code)
    """
    if not iso_639_2:
        return False
    return iso_639_2 in fetch_elevenlabs_turbo_v3()
|
|
|
|
|
|
|
|
def get_aws_locales_for_language(iso_639_1):
    """
    Get AWS locales for a language using ISO 639-1 (2-letter) code

    Returns a sorted list of matching locales drawn from both AWS
    Transcribe and AWS Polly; empty list for a falsy code.
    """
    if not iso_639_1:
        return []

    prefix = iso_639_1 + '-'
    # Union of every locale AWS knows about, from either service.
    candidates = set(fetch_aws_transcribe_languages()) | set(fetch_aws_polly_languages())

    return sorted(
        locale for locale in candidates
        if locale == iso_639_1 or locale.startswith(prefix)
    )
|
|
|