"""
Wikipedia language information extraction module
"""

import requests
from bs4 import BeautifulSoup
import re

def construct_wiki_url(language_name):
    """
    Construct Wikipedia URL from language name
    Pattern: https://en.wikipedia.org/wiki/{name}_language
    """
    # Remove parenthetical info like "(macrolanguage)" or "(Nigeria)"
    clean_name = language_name.split('(')[0].strip()
    # Replace spaces with underscores
    url_name = clean_name.replace(' ', '_')
    return f"https://en.wikipedia.org/wiki/{url_name}_language"

def check_url_exists(url):
    """Quick check if URL exists"""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}
        response = requests.head(url, headers=headers, timeout=5, allow_redirects=True)
        return response.status_code == 200
    except requests.RequestException:
        return False
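
# Illustrative example (not part of the original module):
#   check_url_exists("https://en.wikipedia.org/wiki/Hausa_language") -> True on a
#   200 response; False on 404s, timeouts, or other request errors.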

def fetch_language_info(wiki_url):
    """
    Fetch language info from Wikipedia page
    Returns dict with: speakers_l1, speakers_l2, family, writing_system, glottolog, wiki_url
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(wiki_url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the infobox (right sidebar with language info)
        infobox = soup.find('table', {'class': 'infobox'})

        if not infobox:
            return None

        info = {
            'wiki_url': wiki_url,
            'speakers_l1': None,
            'speakers_l2': None,
            'family': None,
            'writing_system': None,
            'glottolog': None
        }

        # Extract rows from infobox
        rows = infobox.find_all('tr')

        for row in rows:
            # Get the header cell (th)
            th = row.find('th')
            if not th:
                continue

            header_text = th.get_text(strip=True).lower()

            # Find the data cell (td)
            td = row.find('td')
            if not td:
                continue

            # Extract speakers (L1 and L2 are often in the same cell)
            if 'native speakers' in header_text or header_text == 'speakers':
                speakers_text = td.get_text(strip=True)
                speakers_text = re.sub(r'\[\d+\]', '', speakers_text)
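                # At this point speakers_text might look like (illustrative, not
                # from the source): "L1:44 million(2021)L2:80 million(2021)",
                # since get_text(strip=True) concatenates text nodes without spaces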

                # Look for L1: pattern
                l1_match = re.search(r'L1[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l1_match:
                    info['speakers_l1'] = f"{l1_match.group(1)} {l1_match.group(2).lower()}"
                else:
                    # Fallback: extract first number
                    match = re.search(r'([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                    if match:
                        info['speakers_l1'] = f"{match.group(1)} {match.group(2).lower()}"
                    else:
                        match = re.search(r'([\d,]+)', speakers_text)
                        if match:
                            info['speakers_l1'] = match.group(1)

                # Look for L2: pattern
                l2_match = re.search(r'L2[:\s]*([\d,\.]+)\s*(million|billion)', speakers_text, re.IGNORECASE)
                if l2_match:
                    info['speakers_l2'] = f"{l2_match.group(1)} {l2_match.group(2).lower()}"

            # Extract language family
            elif 'family' in header_text:
                # Get first link in the family tree
                family_link = td.find('a')
                if family_link:
                    info['family'] = family_link.get_text(strip=True)
                else:
                    info['family'] = td.get_text(strip=True)[:100]  # Truncate if too long

            # Extract writing system
            elif 'writing system' in header_text:
                # Get the full text first, then clean it up
                ws_text = td.get_text(separator=' ', strip=True)

                # Remove references like [1], [2]
                ws_text = re.sub(r'\[\d+\]', '', ws_text)

                # Clean up extra whitespace
                ws_text = ' '.join(ws_text.split())

                # Filter out "None" or "Unwritten" entries
                if ws_text and ws_text not in ['None', 'none', 'Unwritten', 'unwritten']:
                    # Limit length to avoid overly long descriptions
                    info['writing_system'] = ws_text[:150]

            # Extract Glottolog ID
            elif 'glottolog' in header_text:
                # Glottolog is usually a link or code
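                # (illustrative: Glottolog ids are four letters plus four digits,
                # e.g. "yoru1245"; this example is not taken from the source)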
                glottolog_link = td.find('a')
                if glottolog_link:
                    # Extract code from link text or href
                    glottolog_text = glottolog_link.get_text(strip=True)
                    info['glottolog'] = glottolog_text
                else:
                    # Sometimes it's just text
                    glottolog_text = td.get_text(strip=True)
                    if glottolog_text and glottolog_text not in ['None', 'none', '—']:
                        info['glottolog'] = glottolog_text

        return info

    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
        return None
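
# Minimal usage sketch (an assumption, not part of the original module's API);
# the language name below is arbitrary and chosen for illustration only.
if __name__ == '__main__':
    url = construct_wiki_url('Swahili (macrolanguage)')
    if check_url_exists(url):
        print(fetch_language_info(url))
    else:
        print(f"No Wikipedia page found at {url}")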