# speech-resource-finder / commercial_services.py
# Author: Alp — "wiki search, huge refactor" (commit 5ea1cbe)
"""
Commercial speech service support checkers
Includes Azure, Google Cloud, AWS, and ElevenLabs
"""
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
    """Scrape the Azure Speech-to-Text language-support page.

    Returns:
        dict: locale code (e.g. 'en-US') -> language display name, taken
        from the first table whose header mentions Locale/Language.
        Empty dict if the page cannot be fetched or parsed.
    """
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"
    result = {}
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'html.parser')
        for table in soup.find_all('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                continue
            header_text = ' '.join(
                th.get_text(strip=True) for th in table_rows[0].find_all('th')
            )
            # Only the locale table is of interest; skip the rest and stop
            # once it has been processed.
            if 'Locale' not in header_text and 'Language' not in header_text:
                continue
            for data_row in table_rows[1:]:
                cells = data_row.find_all('td')
                if len(cells) >= 2:
                    code = cells[0].get_text(strip=True)
                    name = cells[1].get_text(strip=True)
                    if code and name:
                        result[code] = name
            break
        return result
    except Exception as e:
        print(f"Error fetching Azure ASR data: {e}")
        return {}
@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
    """Scrape the Azure Text-to-Speech language-support page.

    Returns:
        dict: locale code -> {'language': display name, 'voice_count':
        number of 'Neural' occurrences in the voices cell}. Empty dict if
        the page cannot be fetched or parsed.
    """
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"
    result = {}
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'html.parser')
        for table in soup.find_all('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                continue
            header_text = ' '.join(
                th.get_text(strip=True) for th in table_rows[0].find_all('th')
            )
            if ('Text to speech' not in header_text
                    and 'voices' not in header_text.lower()):
                continue
            for data_row in table_rows[1:]:
                cells = data_row.find_all('td')
                if len(cells) < 3:
                    continue
                code = cells[0].get_text(strip=True)
                name = cells[1].get_text(strip=True)
                # Each 'Neural' occurrence in the voices cell is one voice.
                neural_voices = cells[2].get_text(strip=True).count('Neural')
                if code and name:
                    result[code] = {
                        'language': name,
                        'voice_count': neural_voices,
                    }
            break
        return result
    except Exception as e:
        print(f"Error fetching Azure TTS data: {e}")
        return {}
@lru_cache(maxsize=1)
def fetch_google_stt_languages():
    """Scrape Google Cloud Speech-to-Text supported languages.

    Scans every table on the page for a BCP-47 column; rows from all
    matching tables are merged.

    Returns:
        dict: BCP-47 locale code -> language name ('' when the table has
        no Name column). Empty dict on any fetch/parse error.
    """
    url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        google_stt = {}
        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Locate the BCP-47 and (first) Name column indices.
            bcp47_idx = None
            name_idx = None
            for idx, header in enumerate(headers):
                if 'BCP-47' in header or 'BCP47' in header:
                    bcp47_idx = idx
                if 'Name' in header and name_idx is None:
                    name_idx = idx
            if bcp47_idx is None:
                continue
            for row in rows[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) <= bcp47_idx:
                    continue
                locale = cols[bcp47_idx].get_text(strip=True)
                # BUG FIX: the old `if name_idx and ...` used truthiness,
                # so a Name column at index 0 was treated as missing and
                # every language name came back empty. Compare to None.
                if name_idx is not None and len(cols) > name_idx:
                    language = cols[name_idx].get_text(strip=True)
                else:
                    language = ''
                if locale and locale not in ['—', '-', '']:
                    google_stt[locale] = language
        return google_stt
    except Exception as e:
        print(f"Error fetching Google STT data: {e}")
        return {}
@lru_cache(maxsize=1)
def fetch_google_tts_languages():
    """Scrape Google Cloud Text-to-Speech voices, grouped by locale.

    Returns:
        dict: locale code -> {'language': first-column text of the first
        row seen for that locale, 'voice_count': number of voice rows}.
        Empty dict on any fetch/parse error.
    """
    url = "https://cloud.google.com/text-to-speech/docs/voices"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'html.parser')
        voices_by_locale = {}
        for table in soup.find_all('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                continue
            # First header cell containing 'language code' (any case).
            code_col = next(
                (i for i, th in enumerate(table_rows[0].find_all('th'))
                 if 'language code' in th.get_text(strip=True).lower()),
                None,
            )
            if code_col is None:
                continue
            for data_row in table_rows[1:]:
                cells = data_row.find_all('td')
                if len(cells) <= code_col:
                    continue
                code = cells[code_col].get_text(strip=True)
                if not code or code in ('—', '-'):
                    continue
                if code in voices_by_locale:
                    # Another voice row for a locale we've already seen.
                    voices_by_locale[code]['voice_count'] += 1
                else:
                    name = cells[0].get_text(strip=True) if cells else ''
                    voices_by_locale[code] = {
                        'language': name,
                        'voice_count': 1,
                    }
        return voices_by_locale
    except Exception as e:
        print(f"Error fetching Google TTS data: {e}")
        return {}
@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
    """Return the set of ISO 639-1 codes supported by ElevenLabs
    Multilingual v2 (static list, no network call needed)."""
    # Source: https://elevenlabs.io/docs/models#multilingual-v2
    return set(
        "en ja zh de hi fr ko pt it es "
        "id nl tr fil pl sv bg ro ar cs "
        "el fi hr ms sk da ta uk ru".split()
    )
@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
    """Return the set of ISO 639-2/3 codes supported by ElevenLabs
    Eleven Turbo v3 (static list, no network call needed)."""
    # Source: https://elevenlabs.io/docs/models#eleven-v3-alpha
    return set(
        "afr ara hye asm aze bel ben bos bul cat "
        "ceb nya hrv ces dan nld eng est fil fin "
        "fra glg kat deu ell guj hau heb hin hun "
        "isl ind gle ita jpn jav kan kaz kir kor "
        "lav lin lit ltz mkd msa mal cmn mar nep "
        "nor pus fas pol por pan ron rus srp snd "
        "slk slv som spa swa swe tam tel tha tur "
        "ukr urd vie cym".split()
    )
@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
    """Scrape AWS Transcribe (ASR) supported languages.

    Scans every table on the page that has a 'Language code' column; rows
    from all matching tables are merged.

    Returns:
        dict: language code (e.g. 'en-US') -> language name ('' when the
        table has no Language column). Empty dict on fetch/parse error.
    """
    url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        aws_transcribe = {}
        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                # BUG FIX: 'Language code' also starts with 'Language', so
                # the old two-if version overwrote the name index with the
                # code column's index; `elif` keeps them distinct, and
                # only the first name-like column is kept.
                if 'language code' in header.lower():
                    lang_code_idx = idx
                elif header.startswith('Language') and lang_name_idx is None:
                    lang_name_idx = idx
            if lang_code_idx is None:
                continue
            for row in rows[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) <= lang_code_idx:
                    continue
                locale = cols[lang_code_idx].get_text(strip=True)
                # BUG FIX: the old `if lang_name_idx and ...` was falsy
                # when the Language column is index 0; compare to None.
                if lang_name_idx is not None and len(cols) > lang_name_idx:
                    language = cols[lang_name_idx].get_text(strip=True)
                else:
                    language = ''
                if locale and locale not in ['—', '-', '']:
                    aws_transcribe[locale] = language
        return aws_transcribe
    except Exception as e:
        print(f"Error fetching AWS Transcribe data: {e}")
        return {}
@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
    """Scrape AWS Polly (TTS) supported languages with per-locale voice counts.

    Each table row is one voice/locale combination, so repeated locales
    increment the voice count.

    Returns:
        dict: locale code -> {'language': name, 'voice_count': int}.
        Empty dict on fetch/parse error.
    """
    url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        aws_polly = {}
        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                # BUG FIX: 'Language code' also starts with 'Language', so
                # the old two-if version overwrote the name index with the
                # code column's index; `elif` keeps them distinct, and
                # only the first name-like column is kept.
                if 'language code' in header.lower():
                    lang_code_idx = idx
                elif header.startswith('Language') and lang_name_idx is None:
                    lang_name_idx = idx
            if lang_code_idx is None:
                continue
            for row in rows[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) <= lang_code_idx:
                    continue
                locale = cols[lang_code_idx].get_text(strip=True)
                # BUG FIX: the old `if lang_name_idx and ...` was falsy
                # when the Language column is index 0; compare to None.
                if lang_name_idx is not None and len(cols) > lang_name_idx:
                    language = cols[lang_name_idx].get_text(strip=True)
                else:
                    language = ''
                if locale and locale not in ['—', '-', '']:
                    if locale in aws_polly:
                        aws_polly[locale]['voice_count'] += 1
                    else:
                        aws_polly[locale] = {
                            'language': language,
                            'voice_count': 1,
                        }
        return aws_polly
    except Exception as e:
        print(f"Error fetching AWS Polly data: {e}")
        return {}
def get_azure_locales_for_language(iso_639_1):
    """Get Azure BCP-47 locales for a language using its ISO 639-1 code.

    Returns a sorted list of locales from both the Azure ASR and TTS
    tables whose code equals, or is a regional variant of, iso_639_1.
    """
    if not iso_639_1:
        return []
    prefix = iso_639_1 + '-'
    matches = set()
    for service_map in (fetch_azure_asr_languages(), fetch_azure_tts_languages()):
        matches.update(
            loc for loc in service_map
            if loc == iso_639_1 or loc.startswith(prefix)
        )
    return sorted(matches)
def get_google_locales_for_language(iso_639_1):
    """Get Google Cloud BCP-47 locales for a language using its ISO 639-1 code.

    Returns a sorted list of locales from both the Google STT and TTS
    tables whose code equals, or is a regional variant of, iso_639_1.
    """
    if not iso_639_1:
        return []
    prefix = iso_639_1 + '-'
    matches = set()
    for service_map in (fetch_google_stt_languages(), fetch_google_tts_languages()):
        matches.update(
            loc for loc in service_map
            if loc == iso_639_1 or loc.startswith(prefix)
        )
    return sorted(matches)
def check_elevenlabs_multilingual_v2_support(iso_639_1):
    """Return True when ElevenLabs Multilingual v2 lists this ISO 639-1
    code, False for unsupported or empty/None input."""
    return bool(iso_639_1) and iso_639_1 in fetch_elevenlabs_multilingual_v2()
def check_elevenlabs_turbo_v3_support(iso_639_2):
    """Return True when ElevenLabs Turbo v3 lists this ISO 639-2 code,
    False for unsupported or empty/None input."""
    return bool(iso_639_2) and iso_639_2 in fetch_elevenlabs_turbo_v3()
def get_aws_locales_for_language(iso_639_1):
    """Get AWS locales for a language using its ISO 639-1 code.

    Returns a sorted list of locales from both the AWS Transcribe and
    Polly tables whose code equals, or is a regional variant of, iso_639_1.
    """
    if not iso_639_1:
        return []
    prefix = iso_639_1 + '-'
    matches = set()
    for service_map in (fetch_aws_transcribe_languages(), fetch_aws_polly_languages()):
        matches.update(
            loc for loc in service_map
            if loc == iso_639_1 or loc.startswith(prefix)
        )
    return sorted(matches)