"""
Wikipedia language information extraction module
"""
import re

import requests
from bs4 import BeautifulSoup


def construct_wiki_url(language_name):
    """
    Construct Wikipedia URL from language name
    Pattern: https://en.wikipedia.org/wiki/{name}_language
    """
    # Remove parenthetical info like "(macrolanguage)" or "(Nigeria)"
    clean_name = language_name.split('(')[0].strip()
    # Replace spaces with underscores
    url_name = clean_name.replace(' ', '_')
    return f"https://en.wikipedia.org/wiki/{url_name}_language"


def check_url_exists(url):
    """Quick check if URL exists"""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
        response = requests.head(url, headers=headers, timeout=5, allow_redirects=True)
        return response.status_code == 200
    except requests.RequestException:
        # Catch only request errors; a bare except would also swallow KeyboardInterrupt
        return False


def fetch_language_info(wiki_url):
    """
    Fetch language info from Wikipedia page
    Returns dict with: speakers_l1, speakers_l2, family, writing_system, glottolog, wiki_url
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(wiki_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the infobox (right sidebar with language info)
        infobox = soup.find('table', {'class': 'infobox'})
        if not infobox:
            return None

        info = {
            'wiki_url': wiki_url,
            'speakers_l1': None,
            'speakers_l2': None,
            'family': None,
            'writing_system': None,
            'glottolog': None
        }

        # Extract rows from infobox
        rows = infobox.find_all('tr')
        for row in rows:
            # Get the header cell (th)
            th = row.find('th')
            if not th:
                continue
            header_text = th.get_text(strip=True).lower()
            # Find the data cell (td)
            td = row.find('td')
            if not td:
                continue
            # Extract speakers (L1 and L2 are often in the same cell)
            if 'native speakers' in header_text or header_text == 'speakers':
                speakers_text = td.get_text(strip=True)
                speakers_text = re.sub(r'\[\d+\]', '', speakers_text)
                # Look for L1: pattern
                l1_match = re.search(r'L1[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l1_match:
                    info['speakers_l1'] = f"{l1_match.group(1)} {l1_match.group(2).lower()}"
                else:
                    # Fallback: extract first number
                    match = re.search(r'([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                    if match:
                        info['speakers_l1'] = f"{match.group(1)} {match.group(2).lower()}"
                    else:
                        match = re.search(r'([\d,]+)', speakers_text)
                        if match:
                            info['speakers_l1'] = match.group(1)
                # Look for L2: pattern
                l2_match = re.search(r'L2[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l2_match:
                    info['speakers_l2'] = f"{l2_match.group(1)} {l2_match.group(2).lower()}"
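            # The patterns above assume cell text roughly like "45 million (2015)",
            # "L1: 43 million L2: 16 million", or a bare count such as "7,000"
            # (illustrative formats, not taken from any specific article).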
            # Extract language family
            elif 'language family' in header_text or 'family' in header_text:
                # Get first link in the family tree
                family_link = td.find('a')
                if family_link:
                    info['family'] = family_link.get_text(strip=True)
                else:
                    info['family'] = td.get_text(strip=True)[:100]  # Truncate if too long
            # Extract writing system
            elif 'writing system' in header_text:
                # Get the full text first, then clean it up
                ws_text = td.get_text(separator=' ', strip=True)
                # Remove references like [1], [2]
                ws_text = re.sub(r'\[\d+\]', '', ws_text)
                # Clean up extra whitespace
                ws_text = ' '.join(ws_text.split())
                # Filter out "None" or "Unwritten" entries
                if ws_text and ws_text not in ['None', 'none', 'Unwritten', 'unwritten']:
                    # Limit length to avoid overly long descriptions
                    info['writing_system'] = ws_text[:150]
            # Extract Glottolog ID
            elif 'glottolog' in header_text:
                # Glottolog is usually a link or code
                glottolog_link = td.find('a')
                if glottolog_link:
                    # Extract code from the link text
                    glottolog_text = glottolog_link.get_text(strip=True)
                    info['glottolog'] = glottolog_text
                else:
                    # Sometimes it's just text
                    glottolog_text = td.get_text(strip=True)
                    if glottolog_text and glottolog_text not in ['None', 'none', '—']:
                        info['glottolog'] = glottolog_text
        return info

    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
        return None
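

if __name__ == '__main__':
    # Minimal usage sketch (makes live HTTP requests to Wikipedia).
    # "Yoruba" is only an example input; any language name works.
    url = construct_wiki_url("Yoruba")
    if check_url_exists(url):
        print(fetch_language_info(url))
    else:
        print(f"No Wikipedia article found at {url}")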