# app.py – ES→NI with improved morphological detection and pipeline
# Version 2.2, full tuning and final fixes - CORRECTED
import gradio as gr
import os, csv, re, base64, unicodedata
import torch
import numpy as np

os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache'
os.environ['HF_HOME'] = '/tmp/hf'

# Debug mode
DEBUG_MODE = False  # set True to trace the pipeline


def debug_print(message):
    """Print a pipeline-tracing message when DEBUG_MODE is on."""
    if DEBUG_MODE:
        print(f"[DEBUG] {message}")


# =========================
# 1) LEXICON (CSVs)
# =========================
CSV_CANDIDATES = [
    "HF_Pairs_ES_NI_RICH.csv",
    "HF_Pairs_ES_NI.csv",
    "Diccionario_ES_Neoibero.csv",
    "salida/hf_pairs_rich.csv",
    "salida/hf_pairs.csv",
    "salida/Diccionario_ES_Neoibero.csv",
]

SURF_RICH = {}   # (source_es_lower, es_morph_tag) -> ni_surface
LEX_FORM = {}    # es_form -> ni lemma or forced surface
LEX_LEMMA = {}   # es_infinitive -> ni lemma
FOLD_FORM = {}   # es_form_without_accents -> ni lemma
# Per-form/lemma metadata (POS and whether TAM suffixing is allowed)
LEX_META = {}    # es_form_lower -> {"pos": "V/N/ADJ/…", "tam_ok": True/False}
# Global set of forced Spanish forms whose NI surface must be frozen
FORCE_KEYS = set()

# =========================
# 2) IMPROVED Spanish morphology
# =========================
RE_GER = re.compile(r"(ando|iendo|yendo)$", re.I)
RE_PART = re.compile(r"(ado|ido|to|so|cho)$", re.I)

# Verb endings, indexed person 1sg..3pl
FUT_END = ("é", "ás", "á", "emos", "éis", "án")
COND_END = ("ía", "ías", "ía", "íamos", "íais", "ían")
PRET_AR = ("é", "aste", "ó", "amos", "asteis", "aron")
PRET_ERIR = ("í", "iste", "ió", "imos", "isteis", "ieron")
IMPF_AR = ("aba", "abas", "ábamos", "abais", "aban")
IMPF_ERIR = ("ía", "ías", "íamos", "íais", "ían")
SUBJ_AR = ("e", "es", "e", "emos", "éis", "en")
SUBJ_ERIR = ("a", "as", "a", "amos", "áis", "an")
# Imperfect subjunctive (both -ra and -se series)
SUBJ_PAST_AR = ("ara", "aras", "ara", "áramos", "arais", "aran",
                "ase", "ases", "ase", "ásemos", "aseis", "asen")
SUBJ_PAST_ERIR = ("iera", "ieras", "iera", "iéramos", "ierais", "ieran",
                  "iese", "ieses", "iese", "iésemos", "ieseis", "iesen")
PRS_AR = ("o", "as", "a", "amos", "áis", "an")
PRS_ER = ("o", "es", "e", "emos", "éis", "en")
PRS_IR = ("o", "es", "e", "imos", "ís", "en")


def _strip_any(w, ends):
    """Strip the longest matching ending; return (stem, ending) or (None, None)."""
    for s in sorted(ends, key=len, reverse=True):
        if w.endswith(s):
            return w[:-len(s)], s
    return None, None


def _guess_class_from_ending(ending):
    """Guess conjugation class ('ar' vs 'er') from a stripped ending."""
    if ending in PRET_AR or ending in IMPF_AR or ending in SUBJ_AR or ending in PRS_AR:
        return "ar"
    return "er"


# Expanded irregular form -> lemma table.
# NOTE: a dict cannot hold two lemmas for one form. The original file listed
# "di" both as dar(PST) and decir(IMP), and "ven" both as venir(IMP) and
# ver(PRS); the later entry silently won. The shadowed duplicates have been
# removed here so the effective mapping is explicit ("di"->decir, "ven"->ver);
# TAM disambiguation for those forms happens in detect_tam_with_context.
IRREG_LEMMA = {
    # SER / IR / HABER / ESTAR
    "fui": "ir", "fuiste": "ir", "fue": "ir", "fuimos": "ir", "fuisteis": "ir", "fueron": "ir",
    "voy": "ir", "vas": "ir", "va": "ir", "vamos": "ir", "vais": "ir", "van": "ir",
    "soy": "ser", "eres": "ser", "es": "ser", "somos": "ser", "sois": "ser", "son": "ser",
    "era": "ser", "eras": "ser", "éramos": "ser", "erais": "ser", "eran": "ser",
    "he": "haber", "has": "haber", "ha": "haber", "hemos": "haber", "habéis": "haber", "han": "haber",
    "hube": "haber", "hubo": "haber", "hubimos": "haber", "hubiste": "haber",
    "hubisteis": "haber", "hubieron": "haber",
    "estoy": "estar", "estás": "estar", "está": "estar", "estamos": "estar",
    "estáis": "estar", "están": "estar",
    "estuve": "estar", "estuviste": "estar", "estuvo": "estar", "estuvimos": "estar",
    "estuvisteis": "estar", "estuvieron": "estar",
    "estaba": "estar", "estabas": "estar", "estábamos": "estar", "estabais": "estar",
    "estaban": "estar",
    # Strong preterites
    "tuve": "tener", "tuviste": "tener", "tuvo": "tener", "tuvimos": "tener",
    "tuvisteis": "tener", "tuvieron": "tener",
    "vine": "venir", "viniste": "venir", "vino": "venir", "vinimos": "venir",
    "vinisteis": "venir", "vinieron": "venir",
    "hice": "hacer", "hiciste": "hacer", "hizo": "hacer", "hicimos": "hacer",
    "hicisteis": "hacer", "hicieron": "hacer",
    "puse": "poner", "pusiste": "poner", "puso": "poner", "pusimos": "poner",
    "pusisteis": "poner", "pusieron": "poner",
    "pude": "poder", "pudiste": "poder", "pudo": "poder", "pudimos": "poder",
    "pudisteis": "poder", "pudieron": "poder",
    "quise": "querer", "quisiste": "querer", "quiso": "querer", "quisimos": "querer",
    "quisisteis": "querer", "quisieron": "querer",
    "supe": "saber", "supiste": "saber", "supo": "saber", "supimos": "saber",
    "supisteis": "saber", "supieron": "saber",
    "traje": "traer", "trajiste": "traer", "trajo": "traer", "trajimos": "traer",
    "trajisteis": "traer", "trajeron": "traer",
    "dije": "decir", "dijiste": "decir", "dijo": "decir", "dijimos": "decir",
    "dijisteis": "decir", "dijeron": "decir",
    "conduje": "conducir", "condujiste": "conducir", "condujo": "conducir",
    "condujimos": "conducir", "condujisteis": "conducir", "condujeron": "conducir",
    "anduve": "andar", "anduviste": "andar", "anduvo": "andar", "anduvimos": "andar",
    "anduvisteis": "andar", "anduvieron": "andar",
    "cupe": "caber", "cupiste": "caber", "cupo": "caber", "cupimos": "caber",
    "cupisteis": "caber", "cupieron": "caber",
    "diste": "dar", "dio": "dar", "dimos": "dar", "disteis": "dar", "dieron": "dar",
    "vi": "ver", "viste": "ver", "vio": "ver", "vimos": "ver", "visteis": "ver", "vieron": "ver",
    # 1sg -go
    "tengo": "tener", "vengo": "venir", "pongo": "poner", "salgo": "salir",
    "traigo": "traer", "caigo": "caer", "hago": "hacer", "oigo": "oír",
    "digo": "decir", "valgo": "valer", "sigo": "seguir",
    # Presents e>ie / o>ue / e>i
    "tienes": "tener", "tiene": "tener", "tienen": "tener",
    "vienes": "venir", "viene": "venir", "vienen": "venir",
    "pienso": "pensar", "piensas": "pensar", "piensa": "pensar", "piensan": "pensar",
    "quiero": "querer", "quieres": "querer", "quiere": "querer", "quieren": "querer",
    "prefiero": "preferir", "prefieres": "preferir", "prefiere": "preferir",
    "prefieren": "preferir",
    # Irregular subjunctives
    "vaya": "ir", "vayas": "ir", "vayamos": "ir", "vayáis": "ir", "vayan": "ir",
    "sea": "ser", "seas": "ser", "seamos": "ser", "seáis": "ser", "sean": "ser",
    "haya": "haber", "hayas": "haber", "hayamos": "haber", "hayáis": "haber", "hayan": "haber",
    "dé": "dar", "des": "dar", "demos": "dar", "deis": "dar", "den": "dar",
    "esté": "estar", "estés": "estar", "estemos": "estar", "estéis": "estar", "estén": "estar",
    "tenga": "tener", "tengas": "tener", "tengamos": "tener", "tengáis": "tener", "tengan": "tener",
    "venga": "venir", "vengas": "venir", "vengamos": "venir", "vengáis": "venir", "vengan": "venir",
    "haga": "hacer", "hagas": "hacer", "hagamos": "hacer", "hagáis": "hacer", "hagan": "hacer",
    # Irregular imperatives ("di" -> decir; dar-preterite "di" is shadowed, see note)
    "ve": "ir", "id": "ir", "sé": "ser", "sed": "ser", "haz": "hacer", "haced": "hacer",
    "pon": "poner", "poned": "poner", "venid": "venir", "ten": "tener", "tened": "tener",
    "sal": "salir", "salid": "salir", "di": "decir", "decid": "decir",
    # More forms ("ven" -> ver here; venir-imperative "ven" is shadowed, see note)
    "doy": "dar", "das": "dar", "da": "dar", "damos": "dar", "dais": "dar", "dan": "dar",
    "veo": "ver", "ves": "ver", "vemos": "ver", "veis": "ver", "ven": "ver",
    "oí": "oír", "oíste": "oír", "oyó": "oír", "oímos": "oír", "oísteis": "oír", "oyeron": "oír",
    "iba": "ir", "ibas": "ir", "íbamos": "ir", "ibais": "ir", "iban": "ir",
    "veía": "ver", "veías": "ver", "veíamos": "ver", "veíais": "ver", "veían": "ver",
    # a few typical imperfect subjunctives
    "vinieras": "venir", "lloviera": "llover",
}

# Morphological tags for irregular forms (looked up before suffix analysis)
IRREG_MORPH_TAGS = {
    # Subjunctives
    "vaya": "SBJV", "vayas": "SBJV", "vayamos": "SBJV", "vayáis": "SBJV", "vayan": "SBJV",
    "sea": "SBJV", "seas": "SBJV", "seamos": "SBJV", "seáis": "SBJV", "sean": "SBJV",
    "haya": "SBJV", "hayas": "SBJV", "hayamos": "SBJV", "hayáis": "SBJV", "hayan": "SBJV",
    "dé": "SBJV", "des": "SBJV", "demos": "SBJV", "deis": "SBJV", "den": "SBJV",
    "esté": "SBJV", "estés": "SBJV", "estemos": "SBJV", "estéis": "SBJV", "estén": "SBJV",
    "tenga": "SBJV", "tengas": "SBJV", "tengamos": "SBJV", "tengáis": "SBJV", "tengan": "SBJV",
    "venga": "SBJV", "vengas": "SBJV", "vengamos": "SBJV", "vengáis": "SBJV", "vengan": "SBJV",
    "haga": "SBJV", "hagas": "SBJV", "hagamos": "SBJV", "hagáis": "SBJV", "hagan": "SBJV",
    "pueda": "SBJV", "puedas": "SBJV", "podamos": "SBJV", "podáis": "SBJV", "puedan": "SBJV",
    # Imperatives (note: "ve", "di", "sé" are disambiguated by context
    # in detect_tam_with_context)
    "id": "IMP", "sed": "IMP", "haz": "IMP", "haced": "IMP", "pon": "IMP", "poned": "IMP",
    "ven": "IMP", "venid": "IMP", "ten": "IMP", "tened": "IMP", "sal": "IMP", "salid": "IMP",
    "decid": "IMP",
    # Imperatives with clitics
    "llámame": "IMP", "llámalo": "IMP", "llámala": "IMP", "llámanos": "IMP",
    "llámalos": "IMP", "llámalas": "IMP",
    "dime": "IMP", "dímelo": "IMP", "dinos": "IMP", "dínoslo": "IMP",
    "hazme": "IMP", "hazlo": "IMP", "hazla": "IMP", "haznos": "IMP",
    "ponme": "IMP", "ponlo": "IMP", "ponla": "IMP", "ponnos": "IMP",
    "dame": "IMP", "dámelo": "IMP", "danos": "IMP", "dánoslo": "IMP",
    "tráeme": "IMP", "tráelo": "IMP", "tráela": "IMP", "tráenos": "IMP",
    "díselo": "IMP", "pónselo": "IMP", "házselo": "IMP",
    # Future subjunctive (archaic)
    "viniere": "FUT_SBJV", "vinieres": "FUT_SBJV", "vinieren": "FUT_SBJV",
    "hiciere": "FUT_SBJV", "hicieres": "FUT_SBJV", "hicieren": "FUT_SBJV",
    "fuere": "FUT_SBJV", "fueres": "FUT_SBJV", "fueren": "FUT_SBJV",
    "hubiere": "FUT_SBJV", "hubieres": "FUT_SBJV", "hubieren": "FUT_SBJV",
    # "vosotros" preterite forms
    "creísteis": "PST", "dijisteis": "PST", "hicisteis": "PST", "pusisteis": "PST",
    "supisteis": "PST", "quisisteis": "PST", "trajisteis": "PST",
    # Monosyllabic preterites without accent (critical)
    "vi": "PST", "dio": "PST", "fue": "PST", "fui": "PST",
    # Imperfects
    "iba": "IPFV", "ibas": "IPFV", "íbamos": "IPFV", "ibais": "IPFV", "iban": "IPFV",
    "veía": "IPFV", "veías": "IPFV", "veíamos": "IPFV", "veíais": "IPFV", "veían": "IPFV",
}


def looks_like_verb_form_strict(w: str) -> bool:
    """Heuristic: does *w* look like a conjugated Spanish verb form?"""
    w = (w or "").lower()
    if w.endswith(("ar", "er", "ir")):
        return True
    if RE_GER.search(w) or RE_PART.search(w):
        return True
    if re.search(r"(á|ás|áis|és|éis|ís)$", w):
        return True
    if _strip_any(w, FUT_END + COND_END)[0] is not None:
        return True
    if _strip_any(w, PRET_AR + PRET_ERIR)[0] is not None:
        return True
    if _strip_any(w, IMPF_AR + IMPF_ERIR)[0] is not None:
        return True
    if _strip_any(w, SUBJ_PAST_AR + SUBJ_PAST_ERIR)[0] is not None:
        return True
    if re.search(
        r"(anduve|anduviste|anduvo|anduvimos|anduvieron|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$",
        w,
    ):
        return True
    return False


def _zco_guess(w: str) -> str:
    """Map 1sg -zco presents back to their -cir/-cer infinitives."""
    if w.endswith("uzco"):
        return w[:-4] + "ucir"
    if w.endswith("ezco"):
        return w[:-4] + "ecer"
    if w.endswith("ozco"):
        return w[:-4] + "ocer"
    if w.endswith("azco"):
        return w[:-4] + "acer"
    return ""


def guess_infinitive_es(w: str) -> str:
    """Best-effort Spanish lemmatizer: conjugated form -> infinitive ('' if unknown)."""
    w = (w or "").lower()
    # 0) explicit memory
    if w in IRREG_LEMMA:
        return IRREG_LEMMA[w]
    if w in ("vámonos", "vamonos"):
        return "ir"
    # 1) -zco
    if w.endswith("zco"):
        z = _zco_guess(w)
        if z:
            return z
    # 2) -go fallback
    if w.endswith("go"):
        base = w[:-2]
        MAP = {
            "ten": "tener", "ven": "venir", "pon": "poner", "sal": "salir",
            "tra": "traer", "ca": "caer", "ha": "hacer", "oi": "oír",
            "di": "decir", "val": "valer", "si": "seguir"
        }
        for k, v in MAP.items():
            if base.startswith(k):
                return v
    # 3) already an infinitive
    if w.endswith(("ar", "er", "ir")):
        return w
    # 4) gerund / participle
    m = RE_GER.search(w)
    if m:
        base = w[:m.start()]
        return base + ("ar" if m.group(0) == "ando" else "er")
    m = RE_PART.search(w)
    if m:
        base = w[:m.start()]
        PART_IRREG = {
            "hecho": "hacer", "dicho": "decir", "visto": "ver", "puesto": "poner",
            "escrito": "escribir", "abierto": "abrir", "cubierto": "cubrir",
            "muerto": "morir", "roto": "romper", "vuelto": "volver",
            "resuelto": "resolver", "frito": "freír", "impreso": "imprimir",
            "satisfecho": "satisfacer", "provisto": "proveer"
        }
        if w in PART_IRREG:
            return PART_IRREG[w]
        return base + "er"
    # 5) Future / conditional + irregular stems.
    # FIX: only accept the match when the stripped base ends in "r" (regular
    # future/conditional attaches to the infinitive) or is a known irregular
    # stem. Previously "comía" -> "com" and "miré" -> "mir" were wrongly
    # consumed here; now they fall through to step 7 and lemmatize correctly.
    base, end = _strip_any(w, FUT_END + COND_END)
    if base is not None:
        irreg = {
            "saldr": "salir", "vendr": "venir", "tendr": "tener", "pondr": "poner",
            "valdr": "valer", "podr": "poder", "habr": "haber", "sabr": "saber",
            "cabr": "caber", "querr": "querer", "dir": "decir", "har": "hacer"
        }
        if base in irreg:
            return irreg[base]
        if base.endswith("r"):
            return base
    # 6) accented presents
    if w.endswith("áis"):
        return w[:-3] + "ar"
    if w.endswith("éis"):
        return w[:-3] + "er"
    if w.endswith("ís"):
        return w[:-2] + "ir"
    if w.endswith("ás"):
        return w[:-2] + "ar"
    if w.endswith("és"):
        return w[:-2] + "er"
    if w.endswith("á"):
        return w[:-1] + "ar"
    # 7) preterite / imperfect / present subjunctive / present
    for group in (PRET_AR + PRET_ERIR, IMPF_AR + IMPF_ERIR,
                  SUBJ_AR + SUBJ_ERIR, PRS_AR + PRS_ER + PRS_IR):
        base, end = _strip_any(w, group)
        if base is not None:
            return base + _guess_class_from_ending(end)
    # 8) imperfect subjunctive (ara/ase ; iera/iese)
    base, end = _strip_any(w, SUBJ_PAST_AR)
    if base is not None:
        return base + "ar"
    base, end = _strip_any(w, SUBJ_PAST_ERIR)
    if base is not None:
        return base + "er"
    return ""


def es_morph_tag(w: str) -> str:
    """Tag a Spanish form with a coarse TAM label (PRS/PST/FUT/IPFV/COND/SBJV/IMP/INF/FUT_SBJV/UNK)."""
    w = (w or "").lower()
    # Direct irregular lookup
    if w in IRREG_MORPH_TAGS:
        return IRREG_MORPH_TAGS[w]
    # Imperatives with clitics
    if re.search(r"^(llám|dím|házm|pónm|vén|dám|tén|tráe)(a|e)?(me|te|lo|la|nos|os|les|se|melo|telo|selo)$", w):
        return "IMP"
    if re.search(r"(adme|edme|idme|adlo|edle|idle|adnos|ednos)$", w):
        return "IMP"
    # Gerunds with attached pronouns
    if re.search(r"^.*[áéí]ndo(me|te|se|lo|la|nos|os|les|melo|telo|selo)$", w):
        return "IPFV"
    # Stacked clitics on an imperative
    if re.search(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", w):
        base = re.sub(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", "", w)
        if base and len(base) > 2:
            return "IMP"
    if w.endswith(("ar", "er", "ir")):
        return "INF"
    if RE_GER.search(w):
        return "IPFV"
    if RE_PART.search(w):
        return "PST"
    # NOTE: order matters below; ambiguous endings (e.g. "é") resolve to the
    # first matching series.
    if _strip_any(w, PRET_AR + PRET_ERIR)[0] is not None:
        return "PST"
    if _strip_any(w, IMPF_AR + IMPF_ERIR)[0] is not None:
        return "IPFV"
    if _strip_any(w, FUT_END)[0] is not None:
        return "FUT"
    if _strip_any(w, COND_END)[0] is not None:
        return "COND"
    if re.search(r"(á|ás|áis|és|éis|ís)$", w):
        return "PRS"
    if _strip_any(w, SUBJ_AR + SUBJ_ERIR)[0] is not None:
        return "SBJV"
    if _strip_any(w, PRS_AR + PRS_ER + PRS_IR)[0] is not None:
        return "PRS"
    if _strip_any(w, SUBJ_PAST_AR + SUBJ_PAST_ERIR)[0] is not None:
        return "SBJV"
    if re.search(
        r"(anduve|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$",
        w,
    ):
        return "PST"
    # Improved imperative detection: -ad / -ed / -id endings
    if re.search(r"^.+[aei]d$", w):
        return "IMP"
    return "UNK"


# =========================
# 3) Utilities and lexicon metadata
# =========================
def fold(s: str) -> str:
    """Strip combining diacritics (NFD fold) from *s*."""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != "Mn")


def has_diacritic(s: str) -> bool:
    """True if *s* contains an accented Spanish vowel or ü."""
    return bool(re.search(r"[áéíóúüÁÉÍÓÚÜ]", s or ""))


def _canon_pos(p: str) -> str:
    """Canonicalize a POS label from the CSVs ('' if unrecognized)."""
    p = (p or "").strip().upper()
    MAP = {
        "V": "V", "VERB": "V",
        "N": "N", "NOUN": "N",
        "ADJ": "ADJ", "ADJECTIVE": "ADJ",
        "ADV": "ADV", "ADVERB": "ADV",
        "INTJ": "INTJ", "INTERJ": "INTJ", "INTERJECTION": "INTJ",
        "PRON": "PRON", "PRONOUN": "PRON",
        "PART": "PART", "PARTICLE": "PART",
        "POSTP": "POSTP", "ADP": "POSTP", "ADPOSITION": "POSTP"
    }
    return MAP.get(p, "")


def _boolish(x):
    """Parse a CSV truthy/falsy cell; None when indeterminate."""
    if x is None:
        return None
    s = str(x).strip().lower()
    if s in ("1", "true", "t", "yes", "y", "si", "sí"):
        return True
    if s in ("0", "false", "f", "no", "n"):
        return False
    return None


def _meta_set(form_es: str, pos: str = None, tam_ok=None):
    """Record POS / tam_ok metadata for a Spanish form (first writer wins)."""
    if not form_es:
        return
    d = LEX_META.setdefault(form_es, {})
    if pos and not d.get("pos"):
        d["pos"] = pos
    if tam_ok is not None and d.get("tam_ok") is None:
        d["tam_ok"] = bool(tam_ok)


def pos_of_es(token_low: str) -> str:
    """POS of a Spanish token from metadata, else 'V' if it looks verbal."""
    m = LEX_META.get(token_low, {})
    if m.get("pos"):
        return m["pos"]
    # Fallback: if it "looks like" a verb form, say so
    return "V" if looks_like_verb_form_strict(token_low) else ""


def tam_allowed_for_es(token_low: str) -> bool:
    """Whether TAM suffixing is allowed for this Spanish token."""
    m = LEX_META.get(token_low, {})
    if m.get("tam_ok") is not None:
        return bool(m["tam_ok"])
    return pos_of_es(token_low) == "V"


# =========================
# 4) TTS (Meta MMS VITS)
# =========================
print("Cargando modelo de voz de Meta AI (TTS)...")
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = model = None
try:
    # FIX: import inside the guard so a missing/incompatible `transformers`
    # degrades to "no speech" instead of crashing the whole app at import time.
    from transformers import AutoProcessor, VitsModel
    processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa")
    model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
    print("Modelo de voz cargado con éxito.")
except Exception as e:
    print(f"ERROR: No se pudo cargar el modelo de voz. La locución no funcionará. Error: {e}")

PAUSE_LEVEL = 3


def add_reading_pauses(text: str, level: int = 3) -> str:
    """Duplicate punctuation to coax the TTS into longer pauses (level 1 = off)."""
    if level <= 1:
        return text
    t = text
    if level >= 2:
        t = re.sub(r",\s*", ", , ", t)
    if level >= 3:
        t = re.sub(r"\.\s*", ". . ", t)
        t = re.sub(r";\s*", "; ; ", t)
    return re.sub(r"\s+", " ", t).strip()


def hispanize_for_tts(ni_text: str) -> str:
    """Rewrite NI orthography into something the Spanish TTS can pronounce."""
    text = (ni_text or "").lower()
    # 'ŕ' -> 'rr', 'ś' -> 's'. (The original also chained replacements for
    # 'eś', 'ŕa' and 'aŕe', but those were no-ops: the special characters are
    # already gone after the first two replaces, so they were dropped.)
    text = text.replace('ŕ', 'rr').replace('ś', 's')
    text = text.replace('-', ' ')
    text = re.sub(r'\[.*?\]', '', text)   # drop [SIN-LEX:…]-style markers
    text = re.sub(r'\s+', ' ', text).strip()
    return add_reading_pauses(text, PAUSE_LEVEL)


def synthesize_speech(text):
    """Return (sample_rate, waveform) for *text*, or None if TTS is unavailable."""
    if not text or not text.strip() or model is None or processor is None:
        return None
    try:
        inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device)
        with torch.no_grad():
            output = model(**inputs).waveform
        speech_np = output.cpu().numpy().squeeze()
        # Peak-normalize to 0.9 to avoid clipping
        mx = max(abs(speech_np.min()), abs(speech_np.max()))
        if mx > 0:
            speech_np = speech_np / mx * 0.9
        return (16000, speech_np.astype(np.float32))
    except Exception as e:
        print(f"Error durante la síntesis de voz: {e}")
        return None


# =========================
# 5) Iberian line (visual)
# =========================
KEYS_MODE = "explicit"
V = "aeiou"
SYL_FOR = {
    "b": ["‹BA›", "‹BE›", "‹BI›", "‹BO›", "‹BU›"],
    "d": ["‹DA›", "‹DE›", "‹DI›", "‹DO›", "‹DU›"],
    "t": ["‹TA›", "‹TE›", "‹TI›", "‹TO›", "‹TU›"],
    "g": ["‹GA›", "‹GE›", "‹GI›", "‹GO›", "‹GU›"],
    "k": ["‹KA›", "‹KE›", "‹KI›", "‹KO›", "‹KU›"],
}
ALPHA_FOR = {"a": "‹A›", "e": "‹E›", "i": "‹I›", "o": "‹O›", "u": "‹U›",
             "s": "‹S›", "ś": "‹Ś›", "l": "‹L›", "r": "‹R›", "ŕ": "‹Ŕ›",
             "n": "‹N›", "m": "‹M›"}
CODA_FOR = {"": "", "n": "‹N›", "s": "‹S›", "ś": "‹Ś›", "r": "‹R›", "ŕ": "‹Ŕ›",
            "l": "‹L›", "m": "‹M›", "k": "‹K›", "t": "‹T›"}


def tokens_from_latin(ni: str) -> str:
    """Transliterate a latinized NI word into ‹…› signary tokens."""
    out = []
    i = 0
    ni = (ni or "").lower()
    while i < len(ni):
        c = ni[i]
        # FIX (kept from original): the signary has no independent /p/ → write B-
        if c == "p":
            c = "b"
        if c == "-":
            out.append("—")
            i += 1
            continue
        if c in V:
            out.append(ALPHA_FOR[c])
            i += 1
            continue
        if c in SYL_FOR and i + 1 < len(ni) and ni[i + 1] in V:
            idx = V.index(ni[i + 1])
            tok = SYL_FOR[c][idx]
            # NOTE(review): a following coda consonant is absorbed greedily,
            # even when it could be the onset of the next syllable — confirm
            # this syllabification is intended.
            coda = ni[i + 2] if i + 2 < len(ni) else ""
            if coda in CODA_FOR and coda != "":
                tok += CODA_FOR[coda]
                i += 3
            else:
                i += 2
            out.append(tok)
            continue
        out.append(ALPHA_FOR.get(c, c.upper()))
        i += 1
    return "".join(out)


KEYS_OVERRIDE = {"ka": "K", "mi": "MI", "te": "TE", "ne": "N", "o": "O", "eś": "X"}


def georgeos_keys(token_str: str, ni_plain: str) -> str:
    """Collapse ‹…› signary tokens into a compact key string."""
    low = (ni_plain or "").lower()
    if low in KEYS_OVERRIDE:
        return KEYS_OVERRIDE[low]
    m = re.findall(r"‹(.*?)›", token_str)
    out = []
    for t in m:
        if KEYS_MODE == "compact":
            if len(t) == 2 and t[0] in "BDTGK":
                out.append(t[0])
            elif t in ("A", "E", "I", "O", "U"):
                out.append(t)
            elif t == "Ś":
                out.append("X")
            elif t == "Ŕ":
                out.append("r")
            else:
                out.append(t[0].upper())
        else:
            if len(t) == 2 and t[0] in "BDTGK":
                out.append(t)
            elif t == "Ś":
                out.append("X")
            elif t == "Ŕ":
                out.append("r")
            else:
                out.append(t.upper())
    return "".join(out)


TRIDOT = "/"
# FIX (kept from original): include typographic quotes
VISIBLE_PUNCT = set(",.;:…()[]{}\"'«»——""''")
# Strong clause boundaries (duplicate "—" removed from the original literal)
HARD_BOUND = {".", ";", "—", ":", "(", ")", "«", "»", """, """, "'", "'"}


def render_ib_with_tridots(toks):
    """Join Iberian-line tokens, separating adjacent words with the tridot mark."""
    res = []
    prev_word = False
    for tk in toks:
        is_punct = tk in VISIBLE_PUNCT
        if is_punct:
            res.append(" " + tk + " ")
            prev_word = False
        else:
            if prev_word:
                res.append(" " + TRIDOT + " ")
            res.append(tk)
            prev_word = True
    return "".join(res).strip()


# =========================
# 6) IMPROVED ES→NI translator
# =========================
# TAM suffixes, including imperative and (archaic) future subjunctive
TAM_SUFFIX = {"PRS": "-ke", "PST": "-bo", "FUT": "-ta", "IPFV": "-ri",
              "COND": "-ni", "SBJV": "-ni", "IMP": "-tu", "INF": "",
              "FUT_SBJV": "-ra", "UNK": "-ke"}

# Anti-double TAM (prevents -bo-bo, -ni-ri, etc.)
VERB_TAM = ("-ke", "-ta", "-bo", "-ri", "-ni", "-tu", "-ra") def strip_ni_tam(lemma: str): lemma = lemma or "" for s in sorted(VERB_TAM, key=len, reverse=True): if lemma.endswith(s): return lemma[:-len(s)], s return lemma, "" STOP = set( """ el la los las lo un una unos unas al del de en con sin por sobre entre hasta desde hacia según tras pero aunque sino que como si porque cuando donde mientras muy ya sí no también solo sólo aún aun más menos mi mis tu tus su sus nuestro nuestra nuestros nuestras esto eso aquello ese esa esos esas aquel aquella aquellos aquellas quien quién quiénes cual cuál cuales cuáles cuyo cuya cuyos cuyas eh ay oh uy ah aja jeje jaja aah ahh ohh uhh """.split() ) # ======= Modalidad "presunto ibero-vascoide" ======= Q_ENCLITIC_INT = "-na" # ¿ ... ? Q_ENCLITIC_EXC = "-ba" # ¡ ... ! WH_WORDS = { "qué", "quien", "quién", "quienes", "quiénes", "cual", "cuál", "cuales", "cuáles", "donde", "dónde", "cuando", "cuándo", "como", "cómo", "cuanto", "cuánto", "cuanta", "cuánta", "cuantos", "cuántos", "cuantas", "cuántas" } def is_wh_token(t: str) -> bool: low = (t or "").lower() if low in WH_WORDS: return True f = fold(low) return f in {"que", "quien", "quienes", "cual", "cuales", "donde", "cuando", "como", "cuanto", "cuanta", "cuantos", "cuantas"} # FIX: el detector WH ignora WH dentro de paréntesis y comillas tipográficas def has_wh_outside_parens(toks) -> bool: depth = 0 for tk in toks: if tk in {"(", "«", """, "'"}: depth += 1 elif tk in {")", "»", """, "'"}: depth = max(0, depth - 1) elif depth == 0 and is_wh_token(tk): return True return False def normalize_es(text: str) -> str: return re.sub(r"\s+", " ", (text or "").strip()) # FIX: tokenizar también " " ' ' def tokenize_es(text: str): text = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»——""''])", r" \1 ", text) return [t for t in text.split() if t] # Reglas "a" → ka/mi/te (simplificado) def rule_a(prev_tok: str, token: str, next_tok: str) -> str: verbs = {"dar", "decir", "contar", "enviar", "ofrecer", 
"mostrar", "prestar", "regalar", "entregar"} if prev_tok in verbs: return "mi" nombres = {"ana", "marta", "juan", "pedro", "luis", "maría", "jose", "carlos", "laura"} if next_tok in nombres: return "te" return "ka" ESTAR_SET = {"estoy", "estás", "está", "estamos", "estáis", "están", "estaba", "estabas", "estábamos", "estabais", "estaban"} HABER_SET = {"he", "has", "ha", "hemos", "habéis", "han", "había", "habías", "habíamos", "habíais", "habían"} def detect_tam_with_context(toks, i, sentence_start=False): """Versión mejorada con análisis contextual profundo y desambiguación - CORREGIDO""" t = toks[i].lower() prev = toks[i - 1].lower() if i > 0 else "" prev2 = toks[i - 2].lower() if i > 1 else "" prev3 = toks[i - 3].lower() if i > 2 else "" nxt = toks[i + 1].lower() if i + 1 < len(toks) else "" # Detectar contexto de exclamación (buscar ¡ hacia atrás) in_exclamation = False for j in range(max(0, i - 5), i): if toks[j] == "¡": in_exclamation = True break if toks[j] in {".", ";", ":", "?"}: break # === VERIFICACIONES TEMPRANAS (antes de es_morph_tag) === # Estar/Haber en presente (crítico: evitar detección como FUT) if t in ESTAR_SET or t in HABER_SET: return "PRS" # Primero usar detección morfológica mejorada tag = es_morph_tag(t) # "di" - dar(PST) vs decir(IMP) if t == "di": # Contexto imperativo: inicio + exclamación if (sentence_start or prev in {",", ".", "!", "¡", ";", ":"}) and in_exclamation: return "IMP" # Coordinación con otro pretérito: "vi y di", "vino y di" # Buscar hacia atrás hasta 4 tokens for lookback in range(1, min(5, i + 1)): check_tok = toks[i - lookback].lower() if check_tok in {"vi", "vine", "fui", "fue", "hice", "hizo", "dije", "dijo", "tuve", "tuvo", "puse", "puso", "vio", "vino"}: return "PST" # Si encontramos punto/interrogación, parar búsqueda if check_tok in {".", "?", "!", ";", ":"}: break # Después de pronombre sujeto if prev in {"yo", "él", "ella", "usted", "le"}: return "PST" # Después de "y le" o "y me" → pretérito if prev in {"le", 
"me", "te", "les", "nos"} and prev2 in {"y", "e", "pero"}: return "PST" # Por defecto: pretérito (más frecuente) return "PST" # "ve" - ir(IMP) vs ver(IMP) if t == "ve": # Con destino: "ve a casa" → ir if nxt in {"a", "al", "hacia", "hasta", "para"}: return "IMP" # Inicio + exclamación → imperativo if (sentence_start or prev in {",", ".", "!", "¡", ";", ":"}) and in_exclamation: return "IMP" # Por defecto: imperativo return "IMP" # "sé" - ser(IMP) vs saber(PRS) if t == "sé": # Con adjetivo/nombre: "sé bueno", "sé feliz" → ser(IMP) if nxt and not nxt in {"que", "si", "cuando", "donde", "como", "por", "para"}: # Verificar si siguiente es probablemente adjetivo if nxt in {"bueno", "malo", "feliz", "fuerte", "valiente", "honesto", "paciente"}: return "IMP" # Después de "yo" if prev == "yo": return "PRS" # Con subordinada: "sé que..." → saber(PRS) if nxt == "que": return "PRS" # Inicio + exclamación → imperativo if (sentence_start or prev in {",", ".", "!", "¡", ";", ":"}) and in_exclamation: return "IMP" # Por defecto: presente de saber (más frecuente) return "PRS" # === FIN DESAMBIGUACIÓN === # Detectar múltiples clíticos en contexto imperativo if re.search(r"(melo|telo|selo|noslo|oslo)$", t): if sentence_start or prev in {",", ".", "!", "¡", ";", ":"}: return "IMP" # Contextos especiales # Imperativo al inicio de oración o después de puntuación fuerte if i == 0 or prev in {",", ".", "!", "¡", ";", ":"}: if t in {"ven", "haz", "pon", "sal", "ten", "id", "venid", "tened", "salid"}: return "IMP" # Subjuntivo después de ciertos nexos if prev in {"que", "si", "cuando", "aunque", "mientras", "hasta", "para"}: if tag == "SBJV": return "SBJV" # Intentar detectar subjuntivo por contexto si tag es UNK if tag == "UNK" and re.search(r"(e|a)$", t) and not t.endswith(("ar", "er", "ir")): return "SBJV" # Estar + gerundio if prev in ESTAR_SET and RE_GER.search(nxt): return "IPFV" # Haber + participio if prev in HABER_SET and RE_PART.search(nxt): return "PST" # Ir a + infinitivo 
(perífrasis de futuro) if prev == "a" and prev2 in {"voy", "vas", "va", "vamos", "vais", "van"}: if t.endswith(("ar", "er", "ir")): return "FUT" # Si es gerundio o participio por sí mismo if RE_GER.search(t): return "IPFV" if RE_PART.search(t): return "PST" return tag if tag != "UNK" else "PRS" def forced_lemma_with_context(low: str, prev: str, nxt: str) -> str: if low == "visto" and nxt == "de": return "vestir" return "" def has_tilde_equiv_lookup(low: str) -> str: if has_diacritic(low) and not looks_like_verb_form_strict(low): f = fold(low) if f in LEX_FORM: return LEX_FORM[f] if f in FOLD_FORM: return FOLD_FORM[f] return "" def lookup_form_lemma(token: str, prev: str, nxt: str): if not token: return "", False low = token.lower() fl = forced_lemma_with_context(low, prev, nxt) if fl and fl in LEX_LEMMA: return LEX_LEMMA[fl], True if low in LEX_FORM: return LEX_FORM[low], True til = has_tilde_equiv_lookup(low) if til: return til, True if looks_like_verb_form_strict(low): lem = guess_infinitive_es(low) if lem and lem in LEX_LEMMA: return LEX_LEMMA[lem], True return "", False # === Helpers para enclíticos y puntuación === def attach_enclitic(out_words, ib_keys, plain, attach_idx, encl): """Añade -na/-ba al ítem attach_idx, evitando duplicados exactos.""" if attach_idx is None or attach_idx < 0 or attach_idx >= len(out_words): return cur = out_words[attach_idx] or "" if cur.endswith(encl): return out_words[attach_idx] = cur + encl plain[attach_idx] = (plain[attach_idx] or "") + encl ib_keys[attach_idx] = georgeos_keys(tokens_from_latin(plain[attach_idx]), plain[attach_idx]) def ensure_terminal_qmark(out_words, ib_keys, plain): """Si al final se inyectó -na por WH pero no había '?', garantizar que termine en '?'.""" if not out_words: out_words.append("?") ib_keys.append("") plain.append("?") return j = len(out_words) - 1 # saltar tokens vacíos (por si acaso) while j >= 0 and (out_words[j] == "" or out_words[j] is None): j -= 1 if j < 0: out_words.append("?") 
ib_keys.append("") plain.append("?") return if out_words[j] == ".": out_words[j] = "?" ib_keys[j] = "" plain[j] = "?" elif out_words[j] not in {"?", "!"}: out_words.append("?") ib_keys.append("") plain.append("?") # Normaliza superficie si POS no permite TAM def normalize_surface_by_pos(ni_surface: str, pos: str) -> str: if not ni_surface: return ni_surface if pos != "V": root, _ = strip_ni_tam(ni_surface) return root return ni_surface def translate_sentence(sent: str): """Pipeline mejorado con detección contextual y manejo de imperativos - CORREGIDO""" toks = tokenize_es(normalize_es(sent)) out_words = [] # palabras en ni (latín) ib_keys = [] # claves/teclas (línea ibérica) plain = [] # palabra ni "plana" neg_next = False last_finite_idx = None has_qmark = False has_emark = False saw_wh = has_wh_outside_parens(toks) sentence_start = True # Track inicio de oración for i, t in enumerate(toks): # apertura ¿ ¡ if t == "¿" or t == "¡": sentence_start = True # Reset en apertura continue # cierre ? ! if t == "?" or t == "!": if t == "?": has_qmark = True else: has_emark = True encl = Q_ENCLITIC_INT if t == "?" 
else Q_ENCLITIC_EXC attach_idx = last_finite_idx if attach_idx is None: for j in range(len(out_words) - 1, -1, -1): if out_words[j] and out_words[j] not in VISIBLE_PUNCT: attach_idx = j break if attach_idx is not None: attach_enclitic(out_words, ib_keys, plain, attach_idx, encl) out_words.append(t) ib_keys.append("") plain.append(t) sentence_start = True # Reset después de cierre continue # puntuación visible if t in VISIBLE_PUNCT: out_words.append(t) ib_keys.append(t) plain.append(t) # corte de cláusula fuerte → no arrastrar enclítico a la siguiente if t in HARD_BOUND: last_finite_idx = None sentence_start = (t in {".", ":", ";", "—"}) continue # pipeline normal low = t.lower() prev = toks[i - 1].lower() if i > 0 else "" nxt = toks[i + 1].lower() if i + 1 < len(toks) else "" # === DETECCIÓN MEJORADA === # Detectar imperativo por contexto de inicio o por clíticos if ( (sentence_start and t in {"ve", "ven", "haz", "pon", "sal", "di", "ten", "sé", "id", "venid", "tened"}) or (re.search(r"(me|te|lo|la|nos|os|les|se)$", low) and looks_like_verb_form_strict(low)) ): tag_detected = "IMP" else: tag_detected = detect_tam_with_context(toks, i, sentence_start) debug_print(f"Procesando: {t} → morfología detectada: {tag_detected}") pos_hint = pos_of_es(low) is_verb_like = looks_like_verb_form_strict(low) or (pos_hint == "V") tam_ok = tam_allowed_for_es(low) if low == "no": neg_next = True continue if low in {"el", "la", "los", "las", "al", "del"}: continue if low == "a": ni = rule_a(prev, low, nxt) out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) continue if low in {"un", "una"}: ni = "ban" out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) continue if (low in STOP) and (low not in LEX_FORM): continue # 1) Superficie directa desde rich con TAM mejorado ni_direct = SURF_RICH.get((low, tag_detected)) if neg_next and is_verb_like: out_words.append("eś") 
ib_keys.append(georgeos_keys(tokens_from_latin("eś"), "eś")) plain.append("eś") neg_next = False if ni_direct: debug_print(f"Encontrado en SURF_RICH: {low}, {tag_detected} → {ni_direct}") # CORRECCIÓN CRÍTICA: Si el CSV ya tiene el sufijo TAM correcto, usarlo tal cual if any(ni_direct.endswith(s) for s in VERB_TAM): # Ya tiene un sufijo TAM del CSV, confiar en él ni = ni_direct else: # Solo normalizar si NO tiene sufijo TAM ni = normalize_surface_by_pos(ni_direct, "V" if tam_ok else (pos_hint or "")) out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) if tam_ok and any(ni.endswith(s) for s in VERB_TAM): last_finite_idx = len(out_words) - 1 sentence_start = False # Ya no estamos al inicio continue # 2) Diccionario forma/lema con gating TAM ni_lemma, ok = lookup_form_lemma(t, prev, nxt) if ok: # Caso 2.a – forma forzada (imperativos/presentes: conservar superficie, NO re-TAM) if low in FORCE_KEYS: ni = LEX_FORM.get(low, ni_lemma) # superficie fija (p.ej., "ven" -> "nuker-tu") out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 sentence_start = False continue # Caso 2.b – si detectamos imperativo, forzar -tu root, old_suf = strip_ni_tam(ni_lemma or "") if tag_detected == "IMP": ni = root + "-tu" out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 sentence_start = False continue # Si ya trae -tu, preservarlo if old_suf == "-tu": ni = ni_lemma out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 sentence_start = False continue # Caso 2.c – verbo normal: compón TAM si procede if tam_ok and is_verb_like: suf = TAM_SUFFIX.get(tag_detected, "-ke") base = root or (ni_lemma or "") ni = base + suf if suf else base # No añadir sufijo si es INF out_words.append(ni) 
ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 else: ni = normalize_surface_by_pos(ni_lemma if ni_lemma != "" else "Ø", pos_hint or "") out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) sentence_start = False continue # 3) sin léxico placeholder = f"[SIN-LEX:{t}]" out_words.append(placeholder) ib_keys.append(placeholder) plain.append(placeholder) sentence_start = False # Heurística WH: si hay interrogativa sin ?, añade -na appended_na = False if saw_wh and not has_qmark: encl = Q_ENCLITIC_INT attach_idx = last_finite_idx if attach_idx is None: for j in range(len(out_words) - 1, -1, -1): if ( out_words[j] and out_words[j] not in VISIBLE_PUNCT and out_words[j] not in {"?", "!"} and not out_words[j].startswith("[") ): attach_idx = j break if attach_idx is not None and not ( out_words[attach_idx].endswith("-na") or out_words[attach_idx].endswith("-ba") ): attach_enclitic(out_words, ib_keys, plain, attach_idx, encl) appended_na = True # Si inyectamos -na por WH, garantizar "?" 
    # (WH heuristic, continued) if we injected an interrogative -na but the
    # sentence carried no "?", guarantee a terminal question mark.
    if appended_na and not has_qmark:
        ensure_terminal_qmark(out_words, ib_keys, plain)
    # Drop empty placeholder keys before returning the Iberian-line tokens.
    ib_clean = [k for k in ib_keys if k != ""]
    return " ".join(out_words), ib_clean


def translate(text: str):
    """Translate multi-line Spanish text to Neo-Iberian.

    Each non-empty input line is translated independently through
    translate_sentence(). Returns a pair of newline-joined strings:
    (Latin-script Neo-Iberian, rendered Iberian line).
    """
    lines = [l for l in (text or "").split("\n") if l.strip()]
    ni_lines = []
    ib_lines = []
    for ln in lines:
        ni, ib_toks = translate_sentence(ln)
        ni_lines.append(ni)
        ib_lines.append(render_ib_with_tridots(ib_toks))
    return "\n".join(ni_lines), "\n".join(ib_lines)


# =========================
# 7) Lexicon loading (with POS/tam_ok metadata)
# =========================
def load_lexicon():
    """Populate the global lexicon tables from the candidate CSV files.

    Three schemas are recognised, detected per file from the header row:
      * rich   — source_es/es_morph (+ ni_surface or ni_root+ni_suffix) → SURF_RICH
      * simple — source_es/target_ni → LEX_FORM (+ inferred LEX_LEMMA for verbs)
      * pretty — es/ni_lemma → LEX_FORM (+ inferred LEX_LEMMA for verbs)

    Returns True if at least one CSV was loaded successfully.
    """
    loaded = False
    for p in CSV_CANDIDATES:
        if not os.path.exists(p):
            continue
        try:
            with open(p, encoding="utf-8") as f:
                rd = csv.DictReader(f)
                flds = set(rd.fieldnames or [])
                if {"source_es", "es_morph"}.issubset(flds):  # rich
                    for r in rd:
                        es = (r.get("source_es") or "").strip().lower()
                        tag = (r.get("es_morph") or "").strip().upper()
                        surf = (r.get("ni_surface") or "").strip()
                        if not surf:
                            # No explicit surface form: rebuild it from root+suffix.
                            root = (r.get("ni_root") or "").strip()
                            suf = (r.get("ni_suffix") or "").strip()
                            if root or suf:
                                surf = f"{root}{suf}"
                        if es and tag and surf:
                            SURF_RICH[(es, tag)] = surf
                        ni = (r.get("target_ni") or "").strip()
                        es_lem = (r.get("es_lemma") or "").strip().lower()
                        # POS/tam_ok from the CSV (backwards compatible: several
                        # legacy column names are accepted for the POS field).
                        pos = _canon_pos(
                            r.get("pos") or r.get("es_pos") or r.get("target_pos")
                            or r.get("pos_es") or r.get("ni_pos") or ""
                        )
                        tam_ok = _boolish(r.get("tam_ok"))
                        if es:
                            # Default tam_ok to "is it a verb?" when the CSV is silent.
                            _meta_set(es, pos=pos, tam_ok=(tam_ok if tam_ok is not None else (pos == "V" if pos else None)))
                        if es_lem:
                            _meta_set(
                                es_lem,
                                pos=("V" if es_lem.endswith(("ar", "er", "ir")) else (pos or "")),
                                tam_ok=(tam_ok if tam_ok is not None else (pos == "V" if pos else None)),
                            )
                        if es and ni != "":
                            LEX_FORM.setdefault(es, ni)
                        if es_lem and ni != "":
                            LEX_LEMMA.setdefault(es_lem, ni)
                    loaded = True
                    continue
                if {"source_es", "target_ni"}.issubset(flds):  # simple
                    for r in rd:
                        es = (r.get("source_es") or "").strip().lower()
                        ni = (r.get("target_ni") or "").strip()
                        if not es:
                            continue
                        LEX_FORM.setdefault(es, ni)
                        _meta_set(es, pos="", tam_ok=None)
                        if looks_like_verb_form_strict(es):
                            # Verb-looking form: also index its guessed infinitive.
                            lem = guess_infinitive_es(es)
                            if lem:
                                LEX_LEMMA.setdefault(lem, ni)
                                _meta_set(lem, pos="V", tam_ok=True)
                    loaded = True
                    continue
                if {"es", "ni_lemma"}.issubset(flds):  # pretty dictionary
                    for r in rd:
                        es = (r.get("es") or "").strip().lower()
                        ni = (r.get("ni_lemma") or "").strip()
                        if not es:
                            continue
                        LEX_FORM.setdefault(es, ni)
                        _meta_set(es, pos="", tam_ok=None)
                        if looks_like_verb_form_strict(es):
                            lem = guess_infinitive_es(es)
                            if lem:
                                LEX_LEMMA.setdefault(lem, ni)
                                _meta_set(lem, pos="V", tam_ok=True)
                    loaded = True
                    continue
        except Exception as e:
            print(f"[WARN] No se pudo leer {p}: {e}")

    # Accent-stripped fallback lookup, for non-verbs only.
    global FOLD_FORM
    FOLD_FORM = {}
    for k, v in LEX_FORM.items():
        fk = fold(k)
        if fk != k and len(k) >= 5 and not looks_like_verb_form_strict(k):
            FOLD_FORM.setdefault(fk, v)

    # Minimal attested items plus an extended set of personal pronouns.
    KEEP_MIN = {
        # Particles and determiners
        "y": "ne", "o": "o", "no": "eś", "a": "ka", "para": "ka", "eso": "kok", "tarta": "gatel",
        "el": "", "la": "", "los": "", "las": "",
        "un": "ban", "una": "ban", "unos": "", "unas": "",
        "este": "aŕe", "esta": "aŕe", "estos": "aŕe", "estas": "aŕe",
        # Numerals
        "uno": "ban", "dos": "bi", "tres": "irur", "cuatro": "laur", "cinco": "borste",
        "seis": "śei", "siete": "sisbi", "ocho": "sorse", "nueve": "lauŕbi",
        "diez": "abaŕ", "veinte": "oŕkei",
        # Personal pronouns
        "yo": "ni", "tú": "zu", "él": "nar", "ella": "nar",
        "nosotros": "gu", "nosotras": "gu", "vosotros": "zuek", "vosotras": "zuek",
        "ellos": "narek", "ellas": "narek",
        # Essential connectors
        "que": "ze", "si": "baldin", "cuando": "noiz", "donde": "non", "como": "nola",
        "porque": "zeren", "mientras": "bitarte",
        # Frequent test words
        "versión": "bertsi", "test": "froga", "prueba": "froga", "ejemplo": "adibid",
        "texto": "testu", "palabra": "hitz"
    }
    for k, v in KEEP_MIN.items():
        LEX_FORM.setdefault(k, v)
        if k in {"yo", "tú",
"él", "ella", "nosotros", "nosotras", "vosotros", "vosotras", "ellos", "ellas"}: _meta_set(k, pos="PRON", tam_ok=False) elif k in {"que", "si", "cuando", "donde", "como", "porque", "mientras"}: _meta_set(k, pos="PART", tam_ok=False) else: _meta_set(k, pos=_canon_pos("PART" if k in {"y", "o", "no", "a", "para"} else "DET"), tam_ok=False) # LEXEMAS embebidos (seguridad) BUILTIN_LEMMA = { "venir": "nuker", "llover": "xemmo", "ver": "giŕok", "decir": "siśnesiŕ", "llamar": "lankur", "mostrar": "sunlirket", "andar": "sorsak", "dar": "buś", "enviar": "barmosak", "construir": "giknus", "poder": "giokk", "hacer": "giotael", "querer": "quers", "saber": "suber", "poner": "pusen", "salir": "salku", "ir": "nitus", "ser": "izan", "estar": "egon", "haber": "ukan" } for k, v in BUILTIN_LEMMA.items(): LEX_LEMMA.setdefault(k, v) _meta_set(k, pos="V", tam_ok=True) # Fuerzos de formas (presentes y algunos imperativos irregulares) — prioridad dura FORCE_FORMS = { "voy": "nitus-ke", "vas": "nitus-ke", "va": "nitus-ke", "vamos": "nitus-ke", "vais": "nitus-ke", "van": "nitus-ke", "vengo": "nuker-ke", "vienes": "nuker-ke", "viene": "nuker-ke", "venimos": "nuker-ke", "venís": "nuker-ke", "vienen": "nuker-ke", # Imperativos frecuentes "ven": "nuker-tu", "haz": "giotael-tu", "pon": "pusen-tu", "di": "siśnesir-tu", "sal": "salku-tu", "ten": "giokk-tu", "sé": "suber-tu" # Nota: "ve" es ambiguo (ver/ir); se omite por seguridad. } for form, ni in FORCE_FORMS.items(): LEX_FORM[form] = ni # prioridad dura sobre CSV _meta_set(form, pos="V", tam_ok=True) # Exponer claves de forzados para congelar superficie en el pipeline global FORCE_KEYS FORCE_KEYS = set(FORCE_FORMS.keys()) return loaded _ = load_lexicon() # ========================= # 8) UI + Docs # ========================= LABELS = { "ES": { "title": "Traductor Español → Neoíbero", "subtitle": "Explora el renacimiento de la lengua ibérica antigua con tecnología moderna", "in_label": "✏️ Entrada (Español)", "in_ph": "Escribe aquí. 
Ej.: Veo a Ana y doy pan a Marta.", "out_lat": "📜 Salida: Neoíbero (latín)", "out_ib": "🗿 Línea ibérica", "out_audio": "🔊 Locución (Audio)", "btn": "🔄 Traducir", "combo": "🌍 Idioma (UI + explicación)", "doc_header": "📚 Documentación y Referencia", "acc_titles": [ "🎓 Marco académico y decisiones del neoíbero", "🏛️ Herencia posible del íbero histórico", "🎨 Diseño de la conlang (neoíbero)", "⚙️ Pipeline del traductor (paso a paso)", "🔤 Ortografía, línea ibérica y claves", "❓/❗ Modalidad presunto vascoide (-na / -ba)", "📖 Gramática de referencia (v1.2)", "📚 Bibliografía de base", "🧾 Siglas y glosario" ] }, "EN": { "title": "Spanish → Neo-Iberian Translator", "subtitle": "Explore the revival of the ancient Iberian language with modern tech", "in_label": "✏️ Input (Spanish)", "in_ph": "Type here. E.g., Veo a Ana y doy pan to Marta.", "out_lat": "📜 Output: Neo-Iberian (Latin)", "out_ib": "🗿 Iberian line", "out_audio": "🔊 Speech (Audio)", "btn": "🔄 Translate", "combo": "🌍 Language (UI + docs)", "doc_header": "📚 Documentation & Reference", "acc_titles": [ "🎓 Background & design choices", "🏛️ Possible inheritance from ancient Iberian", "🎨 Conlang design (Neo-Iberian)", "⚙️ Translator pipeline (step by step)", "🔤 Orthography, Iberian line & keys", "❓/❗ 'Vascoid' mood (-na / -ba)", "📖 Reference grammar (v1.2)", "📚 Core references", "🧾 Acronyms & glossary" ] } } DOC_ES_0 = """**Escritura y datos.** Este proyecto recrea un "neoíbero" plausible apoyado en: - El signario nororiental y el alfabeto greco-ibérico (uso **visual**, no paleográfico). - Un léxico experimental (CSVs) con pares ES↔NI y superficies condicionadas por morfología. - Un motor morfológico que reconoce tiempos/asp./modo del español y compone sufijos TAM en NI.""" DOC_ES_1 = """**Qué heredamos (consenso aproximado / modelos típicos):** - Fonotaxis **CV(C)**; ausencia de **/p/** fonémica; *r/ŕ* no inicial. 
- Postposiciones/sufijos nominales: **-k** (pl), **-te** (agentivo), **-ar/-en** (genitivo/origen), **-ka** (dat./loc./dist.), **-i** (acusativo con PN). - Partículas: **ne** 'y', **o** 'o', **eś** 'no'. - Numerales plausibles: *ban, bi, irur, laur, borste, śei, abaŕ (10), oŕkei (20)*. > Implementación **conlang** (hipótesis operativas).""" DOC_ES_2 = """**Diseño de la conlang:** - **TAM verbal**: PRS **-ke**, PST **-bo**, FUT **-ta**, IPFV **-ri**, IMP **-tu**, COND/SBJV **-ni**. - Derivación: verbos (-ke/-ta/-bo/-ri/-ni), adjetivos (-si), nombres (-ar/-en/-tu/-la/-ŕa/-si). - Orden preferente **SOV**.""" DOC_ES_3 = """**Pipeline:** 1) Tokeniza; elimina artículos/contracciones. 2) a → ka/mi/te. 3) CSV rich → **superficie directa**; CSV simple → **lema**. 4) **Gating POS/TAM**: solo verbos reciben TAM; no-verbos purgan TAM si llegara. 5) Negación **eś** antes del primer finito. 6) ¿? / ¡! → enclíticos **-na/-ba** al último verbo finito. 7) WH-sin-¿? → **-na** igualmente. 8) Línea ibérica: solo puntuación visible.""" DOC_ES_4 = """**Ortografía/clave ibérica:** - Línea ibérica por **claves**; modo **explicit** (BA/BE/BI/BO/BU). - Separador de palabra = "/" (tridots). - Atajos: ka→K, mi→MI, te→TE, ne→N, o→O, eś→X.""" DOC_ES_5 = """**Modalidad (-na/-ba):** - **-na** interrogativa; **-ba** exclamativa. - Se adhieren al **último finito**; si no hay, al último constituyente.""" DOC_ES_6 = """**Gramática mínima:** - Verbo: raíz + TAM. - Negación **eś**. 
- Casos productivos en léxico atestiguado: -k, -te, -ka, -ar/-en.""" DOC_ES_7 = """**Bibliografía:** Untermann, de Hoz, Ferrer i Jané, Correa, etc.""" DOC_ES_8 = """**Glosario:** TAM, DOM, SOV, CV(C), CSV, superficie, enclítico…""" DOC = { "ES": [DOC_ES_0, DOC_ES_1, DOC_ES_2, DOC_ES_3, DOC_ES_4, DOC_ES_5, DOC_ES_6, DOC_ES_7, DOC_ES_8], "EN": ["Script & data.", "Possible inheritance.", "Conlang design.", "Pipeline summary.", "Orthography & keys.", "'Vascoid' mood.", "Reference grammar.", "Core references.", "Acronyms & glossary."] } def build_css(): b64 = None if os.path.exists("Iberia-Georgeos.ttf"): with open("Iberia-Georgeos.ttf", "rb") as f: b64 = base64.b64encode(f.read()).decode("ascii") font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')" return f""" @font-face {{ font-family: 'IberiaGeorgeos'; src: {font_src}; font-weight: normal; font-style: normal; }} :root {{ --iberian-clay: #8B4513; --iberian-ochre: #CC7722; --iberian-stone: #5C5C5C; --iberian-sand: #D2B48C; --iberian-rust: #A0522D; --iberian-bronze: #CD7F32; }} .gradio-container {{ background: linear-gradient(135deg, #f4e8d8 0%, #e8d5c4 50%, #d4c4b0 100%) !important; font-family: 'Georgia', 'Times New Roman', serif !important; }} .gradio-container h1, .gradio-container h2, .gradio-container h3 {{ color: var(--iberian-clay) !important; text-shadow: 2px 2px 4px rgba(139,69,19,.15) !important; border-bottom: 3px solid var(--iberian-bronze) !important; padding-bottom: .5rem !important; letter-spacing: .5px !important; }} .gradio-container .gr-group {{ background: linear-gradient(to bottom, #f9f6f0, #ede6dc) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 8px !important; box-shadow: 0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5) !important; padding: 1.5rem !important; margin-bottom: 1.5rem !important; }} .gradio-container .gr-accordion {{ background: linear-gradient(145deg, #ebe3d5, #d9cec0) !important; border: 2px 
solid var(--iberian-rust) !important; border-radius: 6px !important; margin-bottom: .8rem !important; box-shadow: 2px 2px 6px rgba(0,0,0,.15) !important; }} .gradio-container .gr-accordion .label-wrap {{ background: linear-gradient(to right, var(--iberian-ochre), var(--iberian-rust)) !important; color: #fff !important; font-weight: 600 !important; padding: .8rem 1rem !important; border-radius: 4px !important; text-shadow: 1px 1px 2px rgba(0,0,0,.3) !important; }} .gradio-container .gr-textbox textarea, .gradio-container .gr-textbox input {{ background: linear-gradient(to bottom, #faf8f3, #f5f0e8) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 6px !important; color: var(--iberian-stone) !important; font-family: 'Georgia', serif !important; box-shadow: inset 2px 2px 4px rgba(139,69,19,.1) !important; }} .gradio-container .gr-textbox textarea:focus, .gradio-container .gr-textbox input:focus {{ border-color: var(--iberian-bronze) !important; box-shadow: inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3) !important; }} .gradio-container .gr-button.gr-button-primary {{ background: linear-gradient(145deg, var(--iberian-bronze), var(--iberian-rust)) !important; border: 2px solid var(--iberian-clay) !important; color: #fff !important; font-weight: bold !important; text-shadow: 1px 1px 2px rgba(0,0,0,.4) !important; box-shadow: 0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2) !important; border-radius: 8px !important; padding: .8rem 1.5rem !important; transition: all .3s ease !important; }} .gradio-container .gr-button.gr-button-primary:hover {{ background: linear-gradient(145deg, var(--iberian-rust), var(--iberian-bronze)) !important; transform: translateY(-2px) !important; box-shadow: 0 6px 12px rgba(139,69,19,.4) !important; }} .gradio-container .gr-dropdown {{ background: linear-gradient(to bottom, #f5efe8, #ebe3d5) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 6px !important; }} 
.gradio-container .gr-dropdown .wrap {{ background: linear-gradient(to bottom, #f5efe8, #ebe3d5) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 6px !important; }} .ib-line {{ font-family: 'IberiaGeorgeos', monospace, sans-serif !important; font-size: 1.9rem !important; line-height: 2.4rem !important; white-space: pre-wrap !important; background: linear-gradient(135deg, #e8dcc8 0%, #d4c4a8 50%, #c4b098 100%) !important; padding: 24px !important; border-radius: 10px !important; border: 3px solid var(--iberian-rust) !important; border-left: 6px solid var(--iberian-bronze) !important; box-shadow: 0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1) !important; color: var(--iberian-clay) !important; text-shadow: 1px 1px 1px rgba(255,255,255,.3) !important; position: relative !important; }} .ib-line::before {{ content: '' !important; position: absolute !important; top: 0 !important; left: 0 !important; right: 0 !important; bottom: 0 !important; background-image: repeating-linear-gradient(0deg, transparent, transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px) !important; pointer-events: none !important; border-radius: 10px !important; }} @media (max-width: 768px) {{ .ib-line {{ font-size: 1.5rem !important; line-height: 2rem !important; padding: 16px !important; }} .gradio-container .gr-group {{ padding: 1rem !important; }} .gradio-container h1 {{ font-size: 1.8rem !important; }} }} @media (max-width: 480px) {{ .ib-line {{ font-size: 1.3rem !important; line-height: 1.8rem !important; padding: 12px !important; }} .gradio-container h1 {{ font-size: 1.5rem !important; }} }} """ CSS = build_css() with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo: with gr.Group(): title = gr.Markdown(f"# {LABELS['ES']['title']}") subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*") combo = gr.Dropdown(choices=["ES", "EN"], value="ES", label=LABELS["ES"]["combo"]) with gr.Group(): 
doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}") acc_titles = LABELS["ES"]["acc_titles"] with gr.Accordion(acc_titles[0], open=False) as acc1: md1 = gr.Markdown(DOC["ES"][0]) with gr.Accordion(acc_titles[1], open=False) as acc2: md2 = gr.Markdown(DOC["ES"][1]) with gr.Accordion(acc_titles[2], open=False) as acc3: md3 = gr.Markdown(DOC["ES"][2]) with gr.Accordion(acc_titles[3], open=False) as acc4: md4 = gr.Markdown(DOC["ES"][3]) with gr.Accordion(acc_titles[4], open=False) as acc5: md5 = gr.Markdown(DOC["ES"][4]) with gr.Accordion(acc_titles[5], open=False) as acc6: md6 = gr.Markdown(DOC["ES"][5]) with gr.Accordion(acc_titles[6], open=False) as acc7: md7 = gr.Markdown(DOC["ES"][6]) with gr.Accordion(acc_titles[7], open=False) as acc8: md8 = gr.Markdown(DOC["ES"][7]) with gr.Accordion(acc_titles[8], open=False) as acc9: md9 = gr.Markdown(DOC["ES"][8]) with gr.Group(): es_in = gr.Textbox(label=LABELS["ES"]["in_label"], placeholder=LABELS["ES"]["in_ph"], lines=5) btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary") with gr.Row(): with gr.Column(scale=2): ni_out = gr.Textbox(label=LABELS["ES"]["out_lat"], lines=5, interactive=False) loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=False) audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy") with gr.Column(scale=1): ib_out = gr.HTML(label=LABELS["ES"]["out_ib"]) def run_translate_only(txt): latin, ib = translate(txt or "") ib_html = f'
{ib.replace("&","&").replace("<","<").replace(">",">")}
' return latin, ib_html, gr.update(visible=True) btn_tr.click(run_translate_only, es_in, [ni_out, ib_out, loc_btn]) def run_locution_from_out(latin_text): return synthesize_speech(latin_text) loc_btn.click(run_locution_from_out, ni_out, audio_out) def switch_lang(sel): L = LABELS[sel] T = L["acc_titles"] D = DOC[sel] return ( gr.update(value=f"# {L['title']}"), gr.update(value=f"*{L['subtitle']}*"), gr.update(label=L["combo"], value=sel), gr.update(value=f"## {L['doc_header']}"), gr.update(label=T[0]), gr.update(value=D[0]), gr.update(label=T[1]), gr.update(value=D[1]), gr.update(label=T[2]), gr.update(value=D[2]), gr.update(label=T[3]), gr.update(value=D[3]), gr.update(label=T[4]), gr.update(value=D[4]), gr.update(label=T[5]), gr.update(value=D[5]), gr.update(label=T[6]), gr.update(value=D[6]), gr.update(label=T[7]), gr.update(value=D[7]), gr.update(label=T[8]), gr.update(value=D[8]), gr.update(label=L["in_label"], placeholder=L["in_ph"]), gr.update(label=L["out_lat"]), gr.update(label=L["out_ib"]), gr.update(label=L["out_audio"]), gr.update(value=L["btn"]) ) combo.change( switch_lang, combo, [ title, subtitle, combo, doc_header, acc1, md1, acc2, md2, acc3, md3, acc4, md4, acc5, md5, acc6, md6, acc7, md7, acc8, md8, acc9, md9, es_in, ni_out, ib_out, audio_out, btn_tr ] ) if __name__ == "__main__": demo.queue().launch()