"""
HuggingFace model and dataset search functionality
"""

import re
import requests
from bs4 import BeautifulSoup
from collections import defaultdict


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
    Returns integer value or 0 if parsing fails
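    Doctest examples (values chosen to avoid float rounding):
    >>> parse_stat_number('1.5M')
    1500000
    >>> parse_stat_number('23.5k')
    23500
    >>> parse_stat_number('349')
    349
    >>> parse_stat_number('')
    0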
    """
    if not stat_text:
        return 0

    stat_text = stat_text.strip().upper()

    try:
        # Handle 'M' (millions)
        if 'M' in stat_text:
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        # Handle 'K' (thousands)
        elif 'K' in stat_text:
            return int(float(stat_text.replace('K', '')) * 1_000)
        # Plain number
        else:
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0


def search_huggingface_models(iso_639_1, iso_639_2, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language

    Args:
        iso_639_1: ISO 639-1 (2-letter) code
        iso_639_2: ISO 639-2 (3-letter) code
        pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
        max_results: maximum number of models to return
        max_pages: maximum number of pages to search per language code

    Returns:
        tuple: (list of model dicts with keys 'name', 'url', 'downloads',
        'likes' and 'size', list of log message strings)
    """
    logs = []

    # Try both language code formats
    codes_to_try = []
    if iso_639_1:
        codes_to_try.append(iso_639_1)
    if iso_639_2:
        codes_to_try.append(iso_639_2)

    if not codes_to_try:
        logs.append("No language codes available for search")
        return [], logs

    logs.append(f"Language codes to search: {set(codes_to_try)}")

    models = []
    seen_models = set()

    for code in codes_to_try:
        if len(models) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        # Try multiple pages for this language code
        for page in range(max_pages):
            if len(models) >= max_results:
                break

            try:
                # Use HuggingFace model search with pagination
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f"  Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Parse model cards from the page
                model_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not model_cards:
                    logs.append(f"  No model cards found on page {page}")
                    break

                logs.append(f"  Found {len(model_cards)} model cards on page {page}")

                for card in model_cards:
                    if len(models) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')

                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)

                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""

                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    if not stat_text:
                                        continue

                                    # Identify icon type by viewBox or path content
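                                    # (these path strings are compared verbatim against the SVG
                                    # markup huggingface.co currently serves; if the site changes
                                    # its icon set the checks stop matching and the affected
                                    # stats silently keep their defaults of 0 / "")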
                                    svg_str = str(svg)

                                    # Download icon
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)

                                    # Like/heart icon
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)

                                    # Model size icon
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        # Model parameter count (e.g., "2B", "0.6B")
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text

                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f"  Error parsing model card: {e}")
                        continue

            except Exception as e:
                logs.append(f"  ERROR searching page {page}: {e}")
                break

    # Sort by downloads (descending)
    models.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique models found: {len(models)}")
    return models, logs


def search_huggingface_datasets(iso_639_1, iso_639_2, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language

    Args:
        iso_639_1: ISO 639-1 (2-letter) code
        iso_639_2: ISO 639-2 (3-letter) code
        task_category: 'automatic-speech-recognition' or 'text-to-speech'
        max_results: maximum number of datasets to return
        max_pages: maximum number of pages to search per language code

    Returns:
        tuple: (list of dataset dicts with keys 'name', 'url', 'downloads',
        'likes' and 'size', list of log message strings)
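
    Example (illustrative language codes; requires network access):
        datasets, logs = search_huggingface_datasets(
            'sw', 'swa', 'automatic-speech-recognition')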
    """
    logs = []

    # Collect all unique language codes for this language
    language_codes = set()
    if iso_639_1:
        language_codes.add(iso_639_1)
    if iso_639_2:
        language_codes.add(iso_639_2)

    if not language_codes:
        logs.append("No language codes available for search")
        return [], logs

    logs.append(f"Language codes to search: {language_codes}")

    datasets = []
    seen_datasets = set()

    # Search separately for each language code
    for code in language_codes:
        if len(datasets) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(datasets) >= max_results:
                break

            try:
                # Use HuggingFace dataset search
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f"  Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Parse dataset cards from the page
                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not dataset_cards:
                    logs.append(f"  No dataset cards found on page {page}")
                    break

                logs.append(f"  Found {len(dataset_cards)} dataset cards on page {page}")

                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')

                            # Remove "datasets/" prefix if present
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[9:]
                            else:
                                dataset_name = dataset_path

                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)

                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""

                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    # Skip non-numeric text like "Viewer", "Updated", etc.
                                    if not stat_text or stat_text in ('Viewer', 'Updated'):
                                        continue

                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)

                                    # Download icon
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)

                                    # Like/heart icon
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)

                                    # Dataset size icon
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        # Dataset size (e.g., "411k", "23.4M", "65.1k")
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text

                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f"  Error parsing dataset card: {e}")
                        continue

            except Exception as e:
                logs.append(f"  ERROR searching page {page}: {e}")
                break

    # Sort by downloads (descending)
    datasets.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs


def deduplicate_models(models):
    """
    Deduplicate models by base name (without user/org prefix)
    Keep the model with most downloads and count duplicates
    Returns list of deduplicated models with duplicate count added
    """
    # Group models by base name
    grouped = defaultdict(list)
    for model in models:
        # Extract base name (everything after last '/')
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]  # e.g., "whisper-large-v3"
        else:
            base_name = model['name']

        grouped[base_name].append(model)

    # For each group, keep the one with most downloads
    deduplicated = []
    for base_name, model_list in grouped.items():
        # Sort by downloads (descending) and keep the first one
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]

        # Record how many other entries were collapsed into this one
        best_model['duplicates'] = len(model_list) - 1

        deduplicated.append(best_model)

    # Sort by downloads again
    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)

    return deduplicated
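

# Minimal manual check for the search + dedup pipeline above. This is a
# sketch, not part of the module's public surface: the language codes are
# illustrative (any valid ISO 639-1/639-2 pair works) and network access
# to huggingface.co is required.
if __name__ == '__main__':
    found, search_logs = search_huggingface_models(
        iso_639_1='sw',
        iso_639_2='swa',
        pipeline_tag='automatic-speech-recognition',
        max_results=20,
        max_pages=1,
    )
    for line in search_logs:
        print(line)
    for entry in deduplicate_models(found)[:10]:
        print(f"{entry['name']}: {entry['downloads']} downloads, "
              f"{entry['likes']} likes, +{entry['duplicates']} duplicates")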