# speech-resource-finder/wikipedia_info.py
"""
Wikipedia language information extraction module
"""
import re

import requests
from bs4 import BeautifulSoup


def construct_wiki_url(language_name):
    """
    Construct Wikipedia URL from language name
    Pattern: https://en.wikipedia.org/wiki/{name}_language
    """
    # Remove parenthetical info like "(macrolanguage)" or "(Nigeria)"
    clean_name = language_name.split('(')[0].strip()
    # Replace spaces with underscores
    url_name = clean_name.replace(' ', '_')
    return f"https://en.wikipedia.org/wiki/{url_name}_language"


def check_url_exists(url):
    """Quick check if URL exists"""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
        response = requests.head(url, headers=headers, timeout=5, allow_redirects=True)
        return response.status_code == 200
    except requests.RequestException:
        return False


def fetch_language_info(wiki_url):
    """
    Fetch language info from a Wikipedia page.

    Returns a dict with: speakers_l1, speakers_l2, family, writing_system,
    glottolog, wiki_url. Returns None if the request fails or the page has
    no infobox.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(wiki_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the infobox (right sidebar with language info)
        infobox = soup.find('table', {'class': 'infobox'})
        if not infobox:
            return None

        info = {
            'wiki_url': wiki_url,
            'speakers_l1': None,
            'speakers_l2': None,
            'family': None,
            'writing_system': None,
            'glottolog': None
        }

        # Extract rows from infobox
        rows = infobox.find_all('tr')
        for row in rows:
            # Get the header cell (th)
            th = row.find('th')
            if not th:
                continue
            header_text = th.get_text(strip=True).lower()
            # Find the data cell (td)
            td = row.find('td')
            if not td:
                continue
            # Extract speakers (L1 and L2 are often in the same cell)
            if 'native speakers' in header_text or header_text == 'speakers':
                speakers_text = td.get_text(strip=True)
                # Strip citation markers like [1], [2]
                speakers_text = re.sub(r'\[\d+\]', '', speakers_text)
                # Look for an explicit "L1: <n> million/billion" pattern
                l1_match = re.search(r'L1[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l1_match:
                    info['speakers_l1'] = f"{l1_match.group(1)} {l1_match.group(2).lower()}"
                else:
                    # Fallback: first "<n> million/billion", then any bare number
                    match = re.search(r'([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                    if match:
                        info['speakers_l1'] = f"{match.group(1)} {match.group(2).lower()}"
                    else:
                        match = re.search(r'([\d,]+)', speakers_text)
                        if match:
                            info['speakers_l1'] = match.group(1)
                # Look for an explicit "L2: <n> million/billion" pattern
                l2_match = re.search(r'L2[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l2_match:
                    info['speakers_l2'] = f"{l2_match.group(1)} {l2_match.group(2).lower()}"
            # Extract language family
            elif 'language family' in header_text or 'family' in header_text:
                # Get first link in the family tree
                family_link = td.find('a')
                if family_link:
                    info['family'] = family_link.get_text(strip=True)
                else:
                    info['family'] = td.get_text(strip=True)[:100]  # Truncate if too long
            # Extract writing system
            elif 'writing system' in header_text:
                # Get the full text first, then clean it up
                ws_text = td.get_text(separator=' ', strip=True)
                # Remove references like [1], [2]
                ws_text = re.sub(r'\[\d+\]', '', ws_text)
                # Clean up extra whitespace
                ws_text = ' '.join(ws_text.split())
                # Filter out "None" or "Unwritten" entries
                if ws_text and ws_text not in ['None', 'none', 'Unwritten', 'unwritten']:
                    # Limit length to avoid overly long descriptions
                    info['writing_system'] = ws_text[:150]
            # Extract Glottolog ID
            elif 'glottolog' in header_text:
                # Glottolog is usually a link or code
                glottolog_link = td.find('a')
                if glottolog_link:
                    # Extract code from the link text
                    glottolog_text = glottolog_link.get_text(strip=True)
                    info['glottolog'] = glottolog_text
                else:
                    # Sometimes it's just text
                    glottolog_text = td.get_text(strip=True)
                    if glottolog_text and glottolog_text not in ['None', 'none', '—']:
                        info['glottolog'] = glottolog_text

        return info
    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
        return None
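

# Minimal usage sketch: build the candidate URL, check that it resolves, then
# scrape the infobox. The language name below is only an illustrative example.
if __name__ == "__main__":
    url = construct_wiki_url("Swahili")
    if check_url_exists(url):
        print(fetch_language_info(url))
    else:
        print(f"No Wikipedia page found at {url}")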