# speech-resource-finder/wikipedia_info.py
"""
Wikipedia language information extraction module
"""
import re

import requests
from bs4 import BeautifulSoup


def construct_wiki_url(language_name):
    """
    Construct Wikipedia URL from language name
    Pattern: https://en.wikipedia.org/wiki/{name}_language
    """
    # Remove parenthetical info like "(macrolanguage)" or "(Nigeria)"
    clean_name = language_name.split('(')[0].strip()
    # Replace spaces with underscores
    url_name = clean_name.replace(' ', '_')
    return f"https://en.wikipedia.org/wiki/{url_name}_language"


def check_url_exists(url):
    """Quick check if URL exists"""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
        response = requests.head(url, headers=headers, timeout=5, allow_redirects=True)
        return response.status_code == 200
    except requests.RequestException:
        return False


def fetch_language_info(wiki_url):
    """
    Fetch language info from a Wikipedia page.

    Returns a dict with: speakers_l1, speakers_l2, family, writing_system,
    glottolog, wiki_url. Returns None if the request fails or the page has
    no infobox.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(wiki_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the infobox (right sidebar with language info)
        infobox = soup.find('table', {'class': 'infobox'})
        if not infobox:
            return None

        info = {
            'wiki_url': wiki_url,
            'speakers_l1': None,
            'speakers_l2': None,
            'family': None,
            'writing_system': None,
            'glottolog': None
        }

        # Extract rows from infobox
        rows = infobox.find_all('tr')
        for row in rows:
            # Get the header cell (th)
            th = row.find('th')
            if not th:
                continue
            header_text = th.get_text(strip=True).lower()
            # Find the data cell (td)
            td = row.find('td')
            if not td:
                continue
            # Extract speakers (L1 and L2 are often in the same cell)
            if 'native speakers' in header_text or header_text == 'speakers':
                speakers_text = td.get_text(strip=True)
                # Strip citation markers like [1], [2]
                speakers_text = re.sub(r'\[\d+\]', '', speakers_text)
                # Look for an explicit "L1: <n> million/billion" pattern
                l1_match = re.search(r'L1[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l1_match:
                    info['speakers_l1'] = f"{l1_match.group(1)} {l1_match.group(2).lower()}"
                else:
                    # Fallback: first "<n> million/billion", then any bare number
                    match = re.search(r'([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                    if match:
                        info['speakers_l1'] = f"{match.group(1)} {match.group(2).lower()}"
                    else:
                        match = re.search(r'([\d,]+)', speakers_text)
                        if match:
                            info['speakers_l1'] = match.group(1)
                # Look for an explicit "L2: <n> million/billion" pattern
                l2_match = re.search(r'L2[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l2_match:
                    info['speakers_l2'] = f"{l2_match.group(1)} {l2_match.group(2).lower()}"
            # Extract language family
            elif 'language family' in header_text or 'family' in header_text:
                # Get first link in the family tree
                family_link = td.find('a')
                if family_link:
                    info['family'] = family_link.get_text(strip=True)
                else:
                    info['family'] = td.get_text(strip=True)[:100]  # Truncate if too long
            # Extract writing system
            elif 'writing system' in header_text:
                # Get the full text first, then clean it up
                ws_text = td.get_text(separator=' ', strip=True)
                # Remove references like [1], [2]
                ws_text = re.sub(r'\[\d+\]', '', ws_text)
                # Clean up extra whitespace
                ws_text = ' '.join(ws_text.split())
                # Filter out "None" or "Unwritten" entries
                if ws_text and ws_text not in ['None', 'none', 'Unwritten', 'unwritten']:
                    # Limit length to avoid overly long descriptions
                    info['writing_system'] = ws_text[:150]
            # Extract Glottolog ID
            elif 'glottolog' in header_text:
                # Glottolog is usually a link or code
                glottolog_link = td.find('a')
                if glottolog_link:
                    # Extract code from the link text
                    glottolog_text = glottolog_link.get_text(strip=True)
                    info['glottolog'] = glottolog_text
                else:
                    # Sometimes it's just text
                    glottolog_text = td.get_text(strip=True)
                    if glottolog_text and glottolog_text not in ['None', 'none', '—']:
                        info['glottolog'] = glottolog_text

        return info
    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
        return None
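

# Minimal usage sketch: build the candidate URL, check that it resolves, then
# scrape the infobox. The language name below is only an illustrative example.
if __name__ == "__main__":
    url = construct_wiki_url("Swahili")
    if check_url_exists(url):
        print(fetch_language_info(url))
    else:
        print(f"No Wikipedia page found at {url}")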