""" HuggingFace model and dataset search functionality """ import re import requests from bs4 import BeautifulSoup from collections import defaultdict def parse_stat_number(stat_text): """ Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers Returns integer value or 0 if parsing fails """ if not stat_text: return 0 stat_text = stat_text.strip().upper() try: # Handle 'M' (millions) if 'M' in stat_text: return int(float(stat_text.replace('M', '')) * 1_000_000) # Handle 'K' (thousands) elif 'K' in stat_text: return int(float(stat_text.replace('K', '')) * 1_000) # Plain number else: return int(stat_text.replace(',', '')) except (ValueError, AttributeError): return 0 def search_huggingface_models(iso_639_1, iso_639_2, pipeline_tag, max_results=100, max_pages=3): """ Search HuggingFace for models supporting a specific language Args: iso_639_1: ISO 639-1 (2-letter) code iso_639_2: ISO 639-2 (3-letter) code pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech' max_results: maximum number of models to return max_pages: maximum number of pages to search per language code Returns: tuple: (list of model dictionaries, log messages) """ logs = [] # Try both language code formats codes_to_try = [] if iso_639_1: codes_to_try.append(iso_639_1) if iso_639_2: codes_to_try.append(iso_639_2) if not codes_to_try: logs.append("No language codes available for search") return [], logs logs.append(f"Language codes to search: {set(codes_to_try)}") models = [] seen_models = set() for code in codes_to_try: if len(models) >= max_results: break logs.append(f"Searching for language code: {code}") # Try multiple pages for this language code for page in range(max_pages): if len(models) >= max_results: break try: # Use HuggingFace model search with pagination url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending" if page > 0: url += f"&p={page}" logs.append(f" Page {page}: {url}") headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' } response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') # Parse model cards from the page model_cards = soup.find_all('article', class_='overview-card-wrapper') if not model_cards: logs.append(f" No model cards found on page {page}") break logs.append(f" Found {len(model_cards)} model cards on page {page}") for card in model_cards: if len(models) >= max_results: break try: link = card.find('a', href=True) if link: href = link.get('href', '') model_name = href.lstrip('/') if model_name and model_name != '#' and model_name not in seen_models: seen_models.add(model_name) # Parse stats directly from the card HTML by looking at SVG icons downloads = 0 likes = 0 size = "" # Find all SVG elements in the card svgs = card.find_all('svg') for svg in svgs: # Get the next sibling text after the SVG next_elem = svg.find_next_sibling(string=True) stat_text = "" if next_elem and next_elem.strip(): stat_text = next_elem.strip() else: # Try to find text in the next sibling element (e.g., ) next_tag = svg.find_next_sibling() if next_tag: stat_text = next_tag.get_text(strip=True) if not stat_text or len(stat_text) < 1: continue # Identify icon type by viewBox or path content svg_str = str(svg) # Download icon if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str: downloads = parse_stat_number(stat_text) # Like/heart icon elif 
'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str: likes = parse_stat_number(stat_text) # Model size icon elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str: # Model parameter count (e.g., "2B", "0.6B") if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text): size = stat_text models.append({ 'name': model_name, 'url': f"https://huggingface.co/{model_name}", 'downloads': downloads, 'likes': likes, 'size': size }) except Exception as e: logs.append(f" Error parsing model card: {e}") continue except Exception as e: logs.append(f" ERROR searching page {page}: {e}") break # Sort by downloads (descending) models.sort(key=lambda x: x['downloads'], reverse=True) logs.append(f"Total unique models found: {len(models)}") return models, logs def search_huggingface_datasets(iso_639_1, iso_639_2, task_category, max_results=100, max_pages=3): """ Search HuggingFace for datasets supporting a specific language Args: iso_639_1: ISO 639-1 (2-letter) code iso_639_2: ISO 639-2 (3-letter) code task_category: 'automatic-speech-recognition' or 'text-to-speech' max_results: maximum number of datasets to return max_pages: maximum number of pages to search per language code Returns: tuple: (list of dataset dictionaries, log messages) """ logs = [] # Collect all unique language codes for this language language_codes = set() if iso_639_1: language_codes.add(iso_639_1) if iso_639_2: language_codes.add(iso_639_2) if not language_codes: logs.append("No language codes available for search") return [], logs logs.append(f"Language codes to search: {language_codes}") datasets = [] seen_datasets = set() # Search separately for each language code for code in language_codes: if len(datasets) >= max_results: break logs.append(f"Searching for language code: {code}") for page in range(max_pages): if len(datasets) >= max_results: break try: # Use HuggingFace dataset search url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending" if page > 0: url += f"&p={page}" logs.append(f" Page {page}: {url}") headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' } response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') # Parse dataset cards from the page dataset_cards = soup.find_all('article', class_='overview-card-wrapper') if not dataset_cards: logs.append(f" No dataset cards found on page {page}") break logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}") for card in dataset_cards: if len(datasets) >= max_results: break try: link = card.find('a', href=True) if link: href = link.get('href', '') dataset_path = href.lstrip('/') # Remove "datasets/" prefix if present if dataset_path.startswith('datasets/'): dataset_name = dataset_path[9:] else: dataset_name = dataset_path if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets: seen_datasets.add(dataset_name) # Parse stats directly from the card HTML by looking at SVG icons downloads = 0 likes = 0 size = "" # Find all SVG elements in the card svgs = card.find_all('svg') for svg in svgs: # Get the next sibling text after the SVG next_elem = svg.find_next_sibling(string=True) stat_text = "" if next_elem and next_elem.strip(): stat_text = next_elem.strip() else: # Try to find text in the next sibling element (e.g., ) next_tag = svg.find_next_sibling() if next_tag: stat_text = 
next_tag.get_text(strip=True) # Skip non-numeric text like "Viewer", "Updated", etc. if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']: continue # Identify icon type by viewBox or path content svg_str = str(svg) # Download icon if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str: downloads = parse_stat_number(stat_text) # Like/heart icon elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str: likes = parse_stat_number(stat_text) # Dataset size icon elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str: # Dataset size (e.g., "411k", "23.4M", "65.1k") if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit(): size = stat_text datasets.append({ 'name': dataset_name, 'url': f"https://huggingface.co/datasets/{dataset_name}", 'downloads': downloads, 'likes': likes, 'size': size }) except Exception as e: logs.append(f" Error parsing dataset card: {e}") continue except Exception as e: logs.append(f" ERROR searching page {page}: {e}") break # Sort by downloads (descending) datasets.sort(key=lambda x: x['downloads'], reverse=True) logs.append(f"Total unique datasets found: {len(datasets)}") return datasets, logs def deduplicate_models(models): """ Deduplicate models by base name (without user/org prefix) Keep the model with most downloads and count duplicates Returns list of deduplicated models with duplicate count added """ # Group models by base name grouped = defaultdict(list) for model in models: # Extract base name (everything after last '/') name_parts = model['name'].split('/') if len(name_parts) > 1: base_name = name_parts[-1] # e.g., "whisper-large-v3" else: base_name = model['name'] grouped[base_name].append(model) # For each group, keep the one with most downloads deduplicated = [] for base_name, model_list in grouped.items(): # Sort by downloads (descending) and keep the first one model_list.sort(key=lambda x: x['downloads'], reverse=True) best_model = model_list[0] # Add duplicate count (total in group) best_model['duplicates'] = len(model_list) - 1 deduplicated.append(best_model) # Sort by downloads again deduplicated.sort(key=lambda x: x['downloads'], reverse=True) return deduplicated
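

# The block below is a minimal usage sketch, not part of the module's API: it
# assumes network access to huggingface.co and uses 'sw' / 'swa' (Swahili)
# purely as example language codes. It shows the intended flow: scrape model
# and dataset cards, deduplicate models by base name, and print the top
# results sorted by downloads.
if __name__ == "__main__":
    models, model_logs = search_huggingface_models(
        iso_639_1="sw",
        iso_639_2="swa",
        pipeline_tag="automatic-speech-recognition",
        max_results=20,
        max_pages=1,
    )
    datasets, dataset_logs = search_huggingface_datasets(
        iso_639_1="sw",
        iso_639_2="swa",
        task_category="automatic-speech-recognition",
        max_results=20,
        max_pages=1,
    )

    # Print the collected search logs for inspection
    for line in model_logs + dataset_logs:
        print(line)

    # Collapse mirrored/forked copies of the same model (e.g. org-a/whisper-large-v3
    # vs org-b/whisper-large-v3) down to the most-downloaded one
    for model in deduplicate_models(models)[:5]:
        print(f"{model['name']}: {model['downloads']} downloads, "
              f"{model['likes']} likes, {model['duplicates']} duplicate(s)")

    for dataset in datasets[:5]:
        print(f"{dataset['name']}: {dataset['downloads']} downloads, size {dataset['size']}")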