""" Data loading utilities for Speech Resource Finder """ import csv import json import requests def load_language_list(csv_path): """ Load ISO 639 language codes from CSV file Returns: dict: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}} """ languages = {} try: with open(csv_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: iso_639_2 = row['ISO 639-2'].strip() iso_639_1 = row['ISO 639-1'].strip() name = row['English name'].strip() french_name = row['French name'].strip() if iso_639_2 and name: languages[iso_639_2] = { "name": name, "iso_639_1": iso_639_1, "french_name": french_name, } print(f"Loaded {len(languages)} languages from {csv_path}") except Exception as e: print(f"ERROR: Failed to load language list from {csv_path}: {e}") print("The application cannot run without the language codes CSV file.") return languages def load_language_taxonomy(taxonomy_url): """ Load language taxonomy data from Microsoft's linguistic diversity project Returns: dict: {language_name_lowercase: level} """ taxonomy = {} try: response = requests.get(taxonomy_url, timeout=10) response.raise_for_status() # Parse the CSV-like content (format: language_name,level) for line in response.text.strip().split('\n'): if line.strip(): parts = line.strip().split(',') if len(parts) == 2: lang_name = parts[0].strip().lower() level = int(parts[1].strip()) taxonomy[lang_name] = level print(f"Loaded taxonomy data for {len(taxonomy)} languages") except Exception as e: print(f"Warning: Could not load language taxonomy: {e}") print("Language classification will show as 'Unknown'") return taxonomy def load_common_voice_data(json_path): """ Load Common Voice dataset statistics Returns: dict: {locale_code: {validHrs, totalHrs, splits, ...}} """ cv_data = {} try: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) cv_data = data.get('locales', {}) print(f"Loaded Common Voice data for {len(cv_data)} locales") except Exception as e: print(f"Warning: Could not load Common Voice data: {e}") print("Common Voice information will not be available") return cv_data def load_app_content(content_path): """ Load app content from markdown file Returns: dict: {"title": str, "description": str, "full_content": str} """ app_content = { "title": "Speech Resource Finder", "description": "Search for speech resources", "full_content": "" } try: with open(content_path, 'r', encoding='utf-8') as f: content = f.read() # Parse markdown content lines = content.split('\n') # Extract title (first # heading) title = "Speech Resource Finder" for line in lines: if line.startswith('# '): title = line[2:].strip() break # Extract description (text after ## Description until next ##) description = "" in_description = False for line in lines: if line.startswith('## Description'): in_description = True continue elif in_description and line.startswith('##'): break elif in_description and line.strip(): description += line.strip() + " " app_content = { "title": title, "description": description.strip(), "full_content": content } print(f"Loaded app content from {content_path}") except Exception as e: print(f"Error loading app content: {e}") print("Using default content") return app_content def get_common_voice_stats(language_code, iso_639_1, cv_data): """ Get Common Voice statistics for a language Args: language_code: ISO 639-2 (3-letter) code iso_639_1: ISO 639-1 (2-letter) code cv_data: Common Voice dataset dictionary Returns: dict or None: Statistics if found, None otherwise """ # Try to find CV data 
using different code formats cv_locale = None locale_data = None # 1. Try ISO 639-2 (3-letter) code directly (e.g., "zgh", "kab") if language_code and language_code in cv_data: cv_locale = language_code locale_data = cv_data[language_code] # 2. Try ISO 639-1 (2-letter) code (e.g., "en", "fr") elif iso_639_1 and iso_639_1 in cv_data: cv_locale = iso_639_1 locale_data = cv_data[iso_639_1] # 3. Try to find any locale that starts with the 2-letter code (e.g., "fy-NL", "ga-IE") elif iso_639_1: matching_locales = [loc for loc in cv_data.keys() if loc.startswith(iso_639_1 + '-')] if matching_locales: cv_locale = matching_locales[0] # Take the first match locale_data = cv_data[cv_locale] if not locale_data: return None # Extract statistics valid_hrs = locale_data.get('validHrs', 0) total_hrs = locale_data.get('totalHrs', 0) users = locale_data.get('users', 0) # Extract gender balance gender_splits = locale_data.get('splits', {}).get('gender', {}) male_pct = gender_splits.get('male_masculine', 0) * 100 female_pct = gender_splits.get('female_feminine', 0) * 100 # Format users count if users >= 1000: users_formatted = f"{users / 1000:.0f}k" else: users_formatted = str(users) return { 'locale': cv_locale, 'valid_hrs': valid_hrs, 'total_hrs': total_hrs, 'male_pct': male_pct, 'female_pct': female_pct, 'users': users, 'users_formatted': users_formatted }
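

# ---------------------------------------------------------------------------
# Usage sketch (not part of the module's public API): a minimal example of how
# these loaders might be wired together at application startup and how the
# Common Voice lookup is consumed. The file paths and taxonomy URL below are
# placeholders assumed for illustration, not resources shipped with the
# project -- substitute the real locations used by the application.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    LANGUAGE_CSV = "data/iso_639_codes.csv"                       # assumed path
    CV_STATS_JSON = "data/common_voice_stats.json"                # assumed path
    TAXONOMY_URL = "https://example.com/language_taxonomy.csv"    # assumed URL
    CONTENT_MD = "content/app_content.md"                         # assumed path

    # Each loader degrades gracefully (prints a warning and returns an empty
    # or default structure), so this sketch runs even if a resource is missing.
    languages = load_language_list(LANGUAGE_CSV)
    taxonomy = load_language_taxonomy(TAXONOMY_URL)
    cv_data = load_common_voice_data(CV_STATS_JSON)
    app_content = load_app_content(CONTENT_MD)

    # Report Common Voice coverage for each loaded language.
    for code, info in languages.items():
        stats = get_common_voice_stats(code, info["iso_639_1"], cv_data)
        if stats:
            print(f"{info['name']} ({stats['locale']}): "
                  f"{stats['valid_hrs']} validated hours, "
                  f"{stats['users_formatted']} contributors")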