"""
Wikipedia language information extraction module
"""
import re

import requests
from bs4 import BeautifulSoup


def construct_wiki_url(language_name):
    """
    Construct Wikipedia URL from language name
    Pattern: https://en.wikipedia.org/wiki/{name}_language
    """
    # Remove parenthetical info like "(macrolanguage)" or "(Nigeria)"
    clean_name = language_name.split('(')[0].strip()
    # Replace spaces with underscores
    url_name = clean_name.replace(' ', '_')
    return f"https://en.wikipedia.org/wiki/{url_name}_language"


def check_url_exists(url):
    """Quick check if URL exists"""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
        response = requests.head(url, headers=headers, timeout=5, allow_redirects=True)
        return response.status_code == 200
    except requests.RequestException:
        # Catch only request errors; a bare except would also swallow KeyboardInterrupt
        return False


def fetch_language_info(wiki_url):
    """
    Fetch language info from Wikipedia page
    Returns dict with: speakers_l1, speakers_l2, family, writing_system, glottolog, wiki_url
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(wiki_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the infobox (right sidebar with language info)
        infobox = soup.find('table', {'class': 'infobox'})
        if not infobox:
            return None

        info = {
            'wiki_url': wiki_url,
            'speakers_l1': None,
            'speakers_l2': None,
            'family': None,
            'writing_system': None,
            'glottolog': None
        }

        # Extract rows from infobox
        rows = infobox.find_all('tr')
        for row in rows:
            # Get the header cell (th)
            th = row.find('th')
            if not th:
                continue
            header_text = th.get_text(strip=True).lower()
            # Find the data cell (td)
            td = row.find('td')
            if not td:
                continue
            # Extract speakers (L1 and L2 are often in the same cell)
            if 'native speakers' in header_text or header_text == 'speakers':
                speakers_text = td.get_text(strip=True)
                speakers_text = re.sub(r'\[\d+\]', '', speakers_text)
                # Look for L1: pattern
                l1_match = re.search(r'L1[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l1_match:
                    info['speakers_l1'] = f"{l1_match.group(1)} {l1_match.group(2).lower()}"
                else:
                    # Fallback: extract first number
                    match = re.search(r'([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                    if match:
                        info['speakers_l1'] = f"{match.group(1)} {match.group(2).lower()}"
                    else:
                        match = re.search(r'([\d,]+)', speakers_text)
                        if match:
                            info['speakers_l1'] = match.group(1)
                # Look for L2: pattern
                l2_match = re.search(r'L2[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l2_match:
                    info['speakers_l2'] = f"{l2_match.group(1)} {l2_match.group(2).lower()}"
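            # The patterns above assume cell text roughly like "45 million (2015)",
            # "L1: 43 million L2: 16 million", or a bare count such as "7,000"
            # (illustrative formats, not taken from any specific article).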
            # Extract language family
            elif 'language family' in header_text or 'family' in header_text:
                # Get first link in the family tree
                family_link = td.find('a')
                if family_link:
                    info['family'] = family_link.get_text(strip=True)
                else:
                    info['family'] = td.get_text(strip=True)[:100]  # Truncate if too long
            # Extract writing system
            elif 'writing system' in header_text:
                # Get the full text first, then clean it up
                ws_text = td.get_text(separator=' ', strip=True)
                # Remove references like [1], [2]
                ws_text = re.sub(r'\[\d+\]', '', ws_text)
                # Clean up extra whitespace
                ws_text = ' '.join(ws_text.split())
                # Filter out "None" or "Unwritten" entries
                if ws_text and ws_text not in ['None', 'none', 'Unwritten', 'unwritten']:
                    # Limit length to avoid overly long descriptions
                    info['writing_system'] = ws_text[:150]
            # Extract Glottolog ID
            elif 'glottolog' in header_text:
                # Glottolog is usually a link or code
                glottolog_link = td.find('a')
                if glottolog_link:
                    # Extract code from the link text
                    glottolog_text = glottolog_link.get_text(strip=True)
                    info['glottolog'] = glottolog_text
                else:
                    # Sometimes it's just text
                    glottolog_text = td.get_text(strip=True)
                    if glottolog_text and glottolog_text not in ['None', 'none', '—']:
                        info['glottolog'] = glottolog_text
        return info

    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
        return None
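

if __name__ == '__main__':
    # Minimal usage sketch (makes live HTTP requests to Wikipedia).
    # "Yoruba" is only an example input; any language name works.
    url = construct_wiki_url("Yoruba")
    if check_url_exists(url):
        print(fetch_language_info(url))
    else:
        print(f"No Wikipedia article found at {url}")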