"""
HuggingFace model and dataset search functionality
"""

import re
from collections import defaultdict

import requests
from bs4 import BeautifulSoup


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers.

    Returns the integer value, or 0 if parsing fails.
    """
    if not stat_text:
        return 0

    stat_text = stat_text.strip().upper()

    try:
        if 'M' in stat_text:
            # Millions, e.g. '4.07M'
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        elif 'K' in stat_text:
            # Thousands, e.g. '23.4K'
            return int(float(stat_text.replace('K', '')) * 1_000)
        else:
            # Plain number, possibly with thousands separators, e.g. '1,349'
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0
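
# Illustrative conversions (a sketch of expected behaviour, not executed on import):
#   parse_stat_number("4M")    -> 4000000
#   parse_stat_number("1.5k")  -> 1500
#   parse_stat_number("1,349") -> 1349
#   parse_stat_number("n/a")   -> 0
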
def search_huggingface_models(iso_639_1, iso_639_2, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language.

    Args:
        iso_639_1: ISO 639-1 (2-letter) code
        iso_639_2: ISO 639-2 (3-letter) code
        pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
        max_results: maximum number of models to return
        max_pages: maximum number of pages to search per language code

    Returns:
        tuple: (list of model dictionaries, log messages)
    """
    logs = []

    # Try both language codes, the shorter ISO 639-1 code first.
    codes_to_try = []
    if iso_639_1:
        codes_to_try.append(iso_639_1)
    if iso_639_2:
        codes_to_try.append(iso_639_2)

    if not codes_to_try:
        logs.append("No language codes available for search")
        return [], logs

    logs.append(f"Language codes to search: {set(codes_to_try)}")

    models = []
    seen_models = set()

    for code in codes_to_try:
        if len(models) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(models) >= max_results:
                break

            try:
                # The first page has no 'p' parameter; later pages are 0-indexed.
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f" Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Each model on the listing page is rendered as an <article> card.
                model_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not model_cards:
                    logs.append(f" No model cards found on page {page}")
                    break

                logs.append(f" Found {len(model_cards)} model cards on page {page}")

                for card in model_cards:
                    if len(models) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')

                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)

                                downloads = 0
                                likes = 0
                                size = ""

                                # Stats (downloads, likes, parameter size) are rendered as an
                                # icon (<svg>) followed by a text node or tag with the number.
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    if not stat_text:
                                        continue

                                    svg_str = str(svg)

                                    # Identify the icon by its SVG path data:
                                    # download arrow -> downloads
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # heart -> likes
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # parameter-count icon -> model size such as '1.5B'
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text

                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing model card: {e}")
                        continue

            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Most-downloaded models first.
    models.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique models found: {len(models)}")
    return models, logs
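
# Each entry returned by search_huggingface_models has this shape (values illustrative):
#   {'name': 'org/some-asr-model',
#    'url': 'https://huggingface.co/org/some-asr-model',
#    'downloads': 123456, 'likes': 78, 'size': '1.5B'}
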
def search_huggingface_datasets(iso_639_1, iso_639_2, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language.

    Args:
        iso_639_1: ISO 639-1 (2-letter) code
        iso_639_2: ISO 639-2 (3-letter) code
        task_category: 'automatic-speech-recognition' or 'text-to-speech'
        max_results: maximum number of datasets to return
        max_pages: maximum number of pages to search per language code

    Returns:
        tuple: (list of dataset dictionaries, log messages)
    """
    logs = []

    language_codes = set()
    if iso_639_1:
        language_codes.add(iso_639_1)
    if iso_639_2:
        language_codes.add(iso_639_2)

    if not language_codes:
        logs.append("No language codes available for search")
        return [], logs

    logs.append(f"Language codes to search: {language_codes}")

    datasets = []
    seen_datasets = set()

    for code in language_codes:
        if len(datasets) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(datasets) >= max_results:
                break

            try:
                # The first page has no 'p' parameter; later pages are 0-indexed.
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f" Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Each dataset on the listing page is rendered as an <article> card.
                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not dataset_cards:
                    logs.append(f" No dataset cards found on page {page}")
                    break

                logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")

                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')

                            # Card links look like '/datasets/<owner>/<name>'; strip the prefix.
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[len('datasets/'):]
                            else:
                                dataset_name = dataset_path

                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)

                                downloads = 0
                                likes = 0
                                size = ""

                                # Stats are rendered as an icon (<svg>) followed by a text
                                # node or tag with the number.
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    # Skip empty text and UI labels that also follow icons.
                                    if not stat_text or stat_text in ['Viewer', 'Updated']:
                                        continue

                                    svg_str = str(svg)

                                    # Identify the icon by its SVG path data:
                                    # download arrow -> downloads
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # heart -> likes
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # rows icon (heuristic match) -> approximate row count
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text

                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing dataset card: {e}")
                        continue

            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Most-downloaded datasets first.
    datasets.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs
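
# Dataset entries mirror the model entries above, except that 'url' carries the
# 'datasets/' prefix and 'size' holds the approximate row count shown on the card
# (e.g. '10k') rather than a parameter count.
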
def deduplicate_models(models):
    """
    Deduplicate models by base name (without the user/org prefix).

    Keeps the model with the most downloads and counts the duplicates.
    Returns a list of deduplicated models with a 'duplicates' count added.
    """
    # Group models by the repo name after the owner prefix,
    # e.g. 'openai/whisper-small' -> 'whisper-small'.
    grouped = defaultdict(list)
    for model in models:
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]
        else:
            base_name = model['name']

        grouped[base_name].append(model)

    # Within each group keep the most-downloaded entry and record how many
    # other repos shared the same base name.
    deduplicated = []
    for base_name, model_list in grouped.items():
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]

        best_model['duplicates'] = len(model_list) - 1

        deduplicated.append(best_model)

    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)

    return deduplicated
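
if __name__ == "__main__":
    # Minimal usage sketch: the language codes ('sw'/'swa', Swahili) and the
    # pipeline tag are only illustrative; results depend on the live
    # huggingface.co listing pages.
    found_models, search_logs = search_huggingface_models(
        "sw", "swa", "automatic-speech-recognition", max_results=20, max_pages=1
    )
    unique_models = deduplicate_models(found_models)

    for line in search_logs:
        print(line)
    for entry in unique_models[:5]:
        print(f"{entry['name']}: {entry['downloads']} downloads, "
              f"{entry['likes']} likes, {entry['duplicates']} duplicates")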