# app.py – ES→NI with improved morphological detection and pipeline
# Version 2.2, full tuning and final fixes - CORRECTED
import gradio as gr
import os, csv, re, base64, unicodedata
import torch
import numpy as np

os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache'
os.environ['HF_HOME'] = '/tmp/hf'

# Debug mode
DEBUG_MODE = False  # set True to trace the pipeline


def debug_print(message):
    """Print a pipeline-tracing message when DEBUG_MODE is on."""
    if DEBUG_MODE:
        print(f"[DEBUG] {message}")


# =========================
# 1) LEXICON (CSVs)
# =========================
CSV_CANDIDATES = [
    "HF_Pairs_ES_NI_RICH.csv",
    "HF_Pairs_ES_NI.csv",
    "Diccionario_ES_Neoibero.csv",
    "salida/hf_pairs_rich.csv",
    "salida/hf_pairs.csv",
    "salida/Diccionario_ES_Neoibero.csv",
]

SURF_RICH = {}   # (source_es_lower, es_morph_tag) -> ni_surface
LEX_FORM = {}    # es_form -> ni lemma or forced surface
LEX_LEMMA = {}   # es_infinitive -> ni lemma
FOLD_FORM = {}   # es_form_without_accents -> ni lemma
# Per-form/lemma metadata (POS and whether TAM suffixing is allowed)
LEX_META = {}    # es_form_lower -> {"pos": "V/N/ADJ/…", "tam_ok": True/False}
# Global set of forced Spanish forms whose NI surface must be frozen
FORCE_KEYS = set()

# =========================
# 2) IMPROVED Spanish morphology
# =========================
RE_GER = re.compile(r"(ando|iendo|yendo)$", re.I)
RE_PART = re.compile(r"(ado|ido|to|so|cho)$", re.I)

# Verb endings, indexed person 1sg..3pl
FUT_END = ("é", "ás", "á", "emos", "éis", "án")
COND_END = ("ía", "ías", "ía", "íamos", "íais", "ían")
PRET_AR = ("é", "aste", "ó", "amos", "asteis", "aron")
PRET_ERIR = ("í", "iste", "ió", "imos", "isteis", "ieron")
IMPF_AR = ("aba", "abas", "ábamos", "abais", "aban")
IMPF_ERIR = ("ía", "ías", "íamos", "íais", "ían")
SUBJ_AR = ("e", "es", "e", "emos", "éis", "en")
SUBJ_ERIR = ("a", "as", "a", "amos", "áis", "an")
# Imperfect subjunctive (both -ra and -se series)
SUBJ_PAST_AR = ("ara", "aras", "ara", "áramos", "arais", "aran",
                "ase", "ases", "ase", "ásemos", "aseis", "asen")
SUBJ_PAST_ERIR = ("iera", "ieras", "iera", "iéramos", "ierais", "ieran",
                  "iese", "ieses", "iese", "iésemos", "ieseis", "iesen")
PRS_AR = ("o", "as", "a", "amos", "áis", "an")
PRS_ER = ("o", "es", "e", "emos", "éis", "en")
PRS_IR = ("o", "es", "e", "imos", "ís", "en")


def _strip_any(w, ends):
    """Strip the longest matching ending; return (stem, ending) or (None, None)."""
    for s in sorted(ends, key=len, reverse=True):
        if w.endswith(s):
            return w[:-len(s)], s
    return None, None


def _guess_class_from_ending(ending):
    """Guess conjugation class ('ar' vs 'er') from a stripped ending."""
    if ending in PRET_AR or ending in IMPF_AR or ending in SUBJ_AR or ending in PRS_AR:
        return "ar"
    return "er"


# Expanded irregular form -> lemma table.
# NOTE: a dict cannot hold two lemmas for one form. The original file listed
# "di" both as dar(PST) and decir(IMP), and "ven" both as venir(IMP) and
# ver(PRS); the later entry silently won. The shadowed duplicates have been
# removed here so the effective mapping is explicit ("di"->decir, "ven"->ver);
# TAM disambiguation for those forms happens in detect_tam_with_context.
IRREG_LEMMA = {
    # SER / IR / HABER / ESTAR
    "fui": "ir", "fuiste": "ir", "fue": "ir", "fuimos": "ir", "fuisteis": "ir", "fueron": "ir",
    "voy": "ir", "vas": "ir", "va": "ir", "vamos": "ir", "vais": "ir", "van": "ir",
    "soy": "ser", "eres": "ser", "es": "ser", "somos": "ser", "sois": "ser", "son": "ser",
    "era": "ser", "eras": "ser", "éramos": "ser", "erais": "ser", "eran": "ser",
    "he": "haber", "has": "haber", "ha": "haber", "hemos": "haber", "habéis": "haber", "han": "haber",
    "hube": "haber", "hubo": "haber", "hubimos": "haber", "hubiste": "haber",
    "hubisteis": "haber", "hubieron": "haber",
    "estoy": "estar", "estás": "estar", "está": "estar", "estamos": "estar",
    "estáis": "estar", "están": "estar",
    "estuve": "estar", "estuviste": "estar", "estuvo": "estar", "estuvimos": "estar",
    "estuvisteis": "estar", "estuvieron": "estar",
    "estaba": "estar", "estabas": "estar", "estábamos": "estar", "estabais": "estar",
    "estaban": "estar",
    # Strong preterites
    "tuve": "tener", "tuviste": "tener", "tuvo": "tener", "tuvimos": "tener",
    "tuvisteis": "tener", "tuvieron": "tener",
    "vine": "venir", "viniste": "venir", "vino": "venir", "vinimos": "venir",
    "vinisteis": "venir", "vinieron": "venir",
    "hice": "hacer", "hiciste": "hacer", "hizo": "hacer", "hicimos": "hacer",
    "hicisteis": "hacer", "hicieron": "hacer",
    "puse": "poner", "pusiste": "poner", "puso": "poner", "pusimos": "poner",
    "pusisteis": "poner", "pusieron": "poner",
    "pude": "poder", "pudiste": "poder", "pudo": "poder", "pudimos": "poder",
    "pudisteis": "poder", "pudieron": "poder",
    "quise": "querer", "quisiste": "querer", "quiso": "querer", "quisimos": "querer",
    "quisisteis": "querer", "quisieron": "querer",
    "supe": "saber", "supiste": "saber", "supo": "saber", "supimos": "saber",
    "supisteis": "saber", "supieron": "saber",
    "traje": "traer", "trajiste": "traer", "trajo": "traer", "trajimos": "traer",
    "trajisteis": "traer", "trajeron": "traer",
    "dije": "decir", "dijiste": "decir", "dijo": "decir", "dijimos": "decir",
    "dijisteis": "decir", "dijeron": "decir",
    "conduje": "conducir", "condujiste": "conducir", "condujo": "conducir",
    "condujimos": "conducir", "condujisteis": "conducir", "condujeron": "conducir",
    "anduve": "andar", "anduviste": "andar", "anduvo": "andar", "anduvimos": "andar",
    "anduvisteis": "andar", "anduvieron": "andar",
    "cupe": "caber", "cupiste": "caber", "cupo": "caber", "cupimos": "caber",
    "cupisteis": "caber", "cupieron": "caber",
    "diste": "dar", "dio": "dar", "dimos": "dar", "disteis": "dar", "dieron": "dar",
    "vi": "ver", "viste": "ver", "vio": "ver", "vimos": "ver", "visteis": "ver", "vieron": "ver",
    # 1sg -go
    "tengo": "tener", "vengo": "venir", "pongo": "poner", "salgo": "salir",
    "traigo": "traer", "caigo": "caer", "hago": "hacer", "oigo": "oír",
    "digo": "decir", "valgo": "valer", "sigo": "seguir",
    # Presents e>ie / o>ue / e>i
    "tienes": "tener", "tiene": "tener", "tienen": "tener",
    "vienes": "venir", "viene": "venir", "vienen": "venir",
    "pienso": "pensar", "piensas": "pensar", "piensa": "pensar", "piensan": "pensar",
    "quiero": "querer", "quieres": "querer", "quiere": "querer", "quieren": "querer",
    "prefiero": "preferir", "prefieres": "preferir", "prefiere": "preferir",
    "prefieren": "preferir",
    # Irregular subjunctives
    "vaya": "ir", "vayas": "ir", "vayamos": "ir", "vayáis": "ir", "vayan": "ir",
    "sea": "ser", "seas": "ser", "seamos": "ser", "seáis": "ser", "sean": "ser",
    "haya": "haber", "hayas": "haber", "hayamos": "haber", "hayáis": "haber", "hayan": "haber",
    "dé": "dar", "des": "dar", "demos": "dar", "deis": "dar", "den": "dar",
    "esté": "estar", "estés": "estar", "estemos": "estar", "estéis": "estar", "estén": "estar",
    "tenga": "tener", "tengas": "tener", "tengamos": "tener", "tengáis": "tener", "tengan": "tener",
    "venga": "venir", "vengas": "venir", "vengamos": "venir", "vengáis": "venir", "vengan": "venir",
    "haga": "hacer", "hagas": "hacer", "hagamos": "hacer", "hagáis": "hacer", "hagan": "hacer",
    # Irregular imperatives ("di" -> decir; dar-preterite "di" is shadowed, see note)
    "ve": "ir", "id": "ir", "sé": "ser", "sed": "ser", "haz": "hacer", "haced": "hacer",
    "pon": "poner", "poned": "poner", "venid": "venir", "ten": "tener", "tened": "tener",
    "sal": "salir", "salid": "salir", "di": "decir", "decid": "decir",
    # More forms ("ven" -> ver here; venir-imperative "ven" is shadowed, see note)
    "doy": "dar", "das": "dar", "da": "dar", "damos": "dar", "dais": "dar", "dan": "dar",
    "veo": "ver", "ves": "ver", "vemos": "ver", "veis": "ver", "ven": "ver",
    "oí": "oír", "oíste": "oír", "oyó": "oír", "oímos": "oír", "oísteis": "oír", "oyeron": "oír",
    "iba": "ir", "ibas": "ir", "íbamos": "ir", "ibais": "ir", "iban": "ir",
    "veía": "ver", "veías": "ver", "veíamos": "ver", "veíais": "ver", "veían": "ver",
    # a few typical imperfect subjunctives
    "vinieras": "venir", "lloviera": "llover",
}

# Morphological tags for irregular forms (looked up before suffix analysis)
IRREG_MORPH_TAGS = {
    # Subjunctives
    "vaya": "SBJV", "vayas": "SBJV", "vayamos": "SBJV", "vayáis": "SBJV", "vayan": "SBJV",
    "sea": "SBJV", "seas": "SBJV", "seamos": "SBJV", "seáis": "SBJV", "sean": "SBJV",
    "haya": "SBJV", "hayas": "SBJV", "hayamos": "SBJV", "hayáis": "SBJV", "hayan": "SBJV",
    "dé": "SBJV", "des": "SBJV", "demos": "SBJV", "deis": "SBJV", "den": "SBJV",
    "esté": "SBJV", "estés": "SBJV", "estemos": "SBJV", "estéis": "SBJV", "estén": "SBJV",
    "tenga": "SBJV", "tengas": "SBJV", "tengamos": "SBJV", "tengáis": "SBJV", "tengan": "SBJV",
    "venga": "SBJV", "vengas": "SBJV", "vengamos": "SBJV", "vengáis": "SBJV", "vengan": "SBJV",
    "haga": "SBJV", "hagas": "SBJV", "hagamos": "SBJV", "hagáis": "SBJV", "hagan": "SBJV",
    "pueda": "SBJV", "puedas": "SBJV", "podamos": "SBJV", "podáis": "SBJV", "puedan": "SBJV",
    # Imperatives (note: "ve", "di", "sé" are disambiguated by context
    # in detect_tam_with_context)
    "id": "IMP", "sed": "IMP", "haz": "IMP", "haced": "IMP", "pon": "IMP", "poned": "IMP",
    "ven": "IMP", "venid": "IMP", "ten": "IMP", "tened": "IMP", "sal": "IMP", "salid": "IMP",
    "decid": "IMP",
    # Imperatives with clitics
    "llámame": "IMP", "llámalo": "IMP", "llámala": "IMP", "llámanos": "IMP",
    "llámalos": "IMP", "llámalas": "IMP",
    "dime": "IMP", "dímelo": "IMP", "dinos": "IMP", "dínoslo": "IMP",
    "hazme": "IMP", "hazlo": "IMP", "hazla": "IMP", "haznos": "IMP",
    "ponme": "IMP", "ponlo": "IMP", "ponla": "IMP", "ponnos": "IMP",
    "dame": "IMP", "dámelo": "IMP", "danos": "IMP", "dánoslo": "IMP",
    "tráeme": "IMP", "tráelo": "IMP", "tráela": "IMP", "tráenos": "IMP",
    "díselo": "IMP", "pónselo": "IMP", "házselo": "IMP",
    # Future subjunctive (archaic)
    "viniere": "FUT_SBJV", "vinieres": "FUT_SBJV", "vinieren": "FUT_SBJV",
    "hiciere": "FUT_SBJV", "hicieres": "FUT_SBJV", "hicieren": "FUT_SBJV",
    "fuere": "FUT_SBJV", "fueres": "FUT_SBJV", "fueren": "FUT_SBJV",
    "hubiere": "FUT_SBJV", "hubieres": "FUT_SBJV", "hubieren": "FUT_SBJV",
    # "vosotros" preterite forms
    "creísteis": "PST", "dijisteis": "PST", "hicisteis": "PST", "pusisteis": "PST",
    "supisteis": "PST", "quisisteis": "PST", "trajisteis": "PST",
    # Monosyllabic preterites without accent (critical)
    "vi": "PST", "dio": "PST", "fue": "PST", "fui": "PST",
    # Imperfects
    "iba": "IPFV", "ibas": "IPFV", "íbamos": "IPFV", "ibais": "IPFV", "iban": "IPFV",
    "veía": "IPFV", "veías": "IPFV", "veíamos": "IPFV", "veíais": "IPFV", "veían": "IPFV",
}


def looks_like_verb_form_strict(w: str) -> bool:
    """Heuristic: does *w* look like a conjugated Spanish verb form?"""
    w = (w or "").lower()
    if w.endswith(("ar", "er", "ir")):
        return True
    if RE_GER.search(w) or RE_PART.search(w):
        return True
    if re.search(r"(á|ás|áis|és|éis|ís)$", w):
        return True
    if _strip_any(w, FUT_END + COND_END)[0] is not None:
        return True
    if _strip_any(w, PRET_AR + PRET_ERIR)[0] is not None:
        return True
    if _strip_any(w, IMPF_AR + IMPF_ERIR)[0] is not None:
        return True
    if _strip_any(w, SUBJ_PAST_AR + SUBJ_PAST_ERIR)[0] is not None:
        return True
    if re.search(
        r"(anduve|anduviste|anduvo|anduvimos|anduvieron|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$",
        w,
    ):
        return True
    return False


def _zco_guess(w: str) -> str:
    """Map 1sg -zco presents back to their -cir/-cer infinitives."""
    if w.endswith("uzco"):
        return w[:-4] + "ucir"
    if w.endswith("ezco"):
        return w[:-4] + "ecer"
    if w.endswith("ozco"):
        return w[:-4] + "ocer"
    if w.endswith("azco"):
        return w[:-4] + "acer"
    return ""


def guess_infinitive_es(w: str) -> str:
    """Best-effort Spanish lemmatizer: conjugated form -> infinitive ('' if unknown)."""
    w = (w or "").lower()
    # 0) explicit memory
    if w in IRREG_LEMMA:
        return IRREG_LEMMA[w]
    if w in ("vámonos", "vamonos"):
        return "ir"
    # 1) -zco
    if w.endswith("zco"):
        z = _zco_guess(w)
        if z:
            return z
    # 2) -go fallback
    if w.endswith("go"):
        base = w[:-2]
        MAP = {
            "ten": "tener", "ven": "venir", "pon": "poner", "sal": "salir",
            "tra": "traer", "ca": "caer", "ha": "hacer", "oi": "oír",
            "di": "decir", "val": "valer", "si": "seguir"
        }
        for k, v in MAP.items():
            if base.startswith(k):
                return v
    # 3) already an infinitive
    if w.endswith(("ar", "er", "ir")):
        return w
    # 4) gerund / participle
    m = RE_GER.search(w)
    if m:
        base = w[:m.start()]
        return base + ("ar" if m.group(0) == "ando" else "er")
    m = RE_PART.search(w)
    if m:
        base = w[:m.start()]
        PART_IRREG = {
            "hecho": "hacer", "dicho": "decir", "visto": "ver", "puesto": "poner",
            "escrito": "escribir", "abierto": "abrir", "cubierto": "cubrir",
            "muerto": "morir", "roto": "romper", "vuelto": "volver",
            "resuelto": "resolver", "frito": "freír", "impreso": "imprimir",
            "satisfecho": "satisfacer", "provisto": "proveer"
        }
        if w in PART_IRREG:
            return PART_IRREG[w]
        return base + "er"
    # 5) Future / conditional + irregular stems.
    # FIX: only accept the match when the stripped base ends in "r" (regular
    # future/conditional attaches to the infinitive) or is a known irregular
    # stem. Previously "comía" -> "com" and "miré" -> "mir" were wrongly
    # consumed here; now they fall through to step 7 and lemmatize correctly.
    base, end = _strip_any(w, FUT_END + COND_END)
    if base is not None:
        irreg = {
            "saldr": "salir", "vendr": "venir", "tendr": "tener", "pondr": "poner",
            "valdr": "valer", "podr": "poder", "habr": "haber", "sabr": "saber",
            "cabr": "caber", "querr": "querer", "dir": "decir", "har": "hacer"
        }
        if base in irreg:
            return irreg[base]
        if base.endswith("r"):
            return base
    # 6) accented presents
    if w.endswith("áis"):
        return w[:-3] + "ar"
    if w.endswith("éis"):
        return w[:-3] + "er"
    if w.endswith("ís"):
        return w[:-2] + "ir"
    if w.endswith("ás"):
        return w[:-2] + "ar"
    if w.endswith("és"):
        return w[:-2] + "er"
    if w.endswith("á"):
        return w[:-1] + "ar"
    # 7) preterite / imperfect / present subjunctive / present
    for group in (PRET_AR + PRET_ERIR, IMPF_AR + IMPF_ERIR,
                  SUBJ_AR + SUBJ_ERIR, PRS_AR + PRS_ER + PRS_IR):
        base, end = _strip_any(w, group)
        if base is not None:
            return base + _guess_class_from_ending(end)
    # 8) imperfect subjunctive (ara/ase ; iera/iese)
    base, end = _strip_any(w, SUBJ_PAST_AR)
    if base is not None:
        return base + "ar"
    base, end = _strip_any(w, SUBJ_PAST_ERIR)
    if base is not None:
        return base + "er"
    return ""


def es_morph_tag(w: str) -> str:
    """Tag a Spanish form with a coarse TAM label (PRS/PST/FUT/IPFV/COND/SBJV/IMP/INF/FUT_SBJV/UNK)."""
    w = (w or "").lower()
    # Direct irregular lookup
    if w in IRREG_MORPH_TAGS:
        return IRREG_MORPH_TAGS[w]
    # Imperatives with clitics
    if re.search(r"^(llám|dím|házm|pónm|vén|dám|tén|tráe)(a|e)?(me|te|lo|la|nos|os|les|se|melo|telo|selo)$", w):
        return "IMP"
    if re.search(r"(adme|edme|idme|adlo|edle|idle|adnos|ednos)$", w):
        return "IMP"
    # Gerunds with attached pronouns
    if re.search(r"^.*[áéí]ndo(me|te|se|lo|la|nos|os|les|melo|telo|selo)$", w):
        return "IPFV"
    # Stacked clitics on an imperative
    if re.search(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", w):
        base = re.sub(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", "", w)
        if base and len(base) > 2:
            return "IMP"
    if w.endswith(("ar", "er", "ir")):
        return "INF"
    if RE_GER.search(w):
        return "IPFV"
    if RE_PART.search(w):
        return "PST"
    # NOTE: order matters below; ambiguous endings (e.g. "é") resolve to the
    # first matching series.
    if _strip_any(w, PRET_AR + PRET_ERIR)[0] is not None:
        return "PST"
    if _strip_any(w, IMPF_AR + IMPF_ERIR)[0] is not None:
        return "IPFV"
    if _strip_any(w, FUT_END)[0] is not None:
        return "FUT"
    if _strip_any(w, COND_END)[0] is not None:
        return "COND"
    if re.search(r"(á|ás|áis|és|éis|ís)$", w):
        return "PRS"
    if _strip_any(w, SUBJ_AR + SUBJ_ERIR)[0] is not None:
        return "SBJV"
    if _strip_any(w, PRS_AR + PRS_ER + PRS_IR)[0] is not None:
        return "PRS"
    if _strip_any(w, SUBJ_PAST_AR + SUBJ_PAST_ERIR)[0] is not None:
        return "SBJV"
    if re.search(
        r"(anduve|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$",
        w,
    ):
        return "PST"
    # Improved imperative detection: -ad / -ed / -id endings
    if re.search(r"^.+[aei]d$", w):
        return "IMP"
    return "UNK"


# =========================
# 3) Utilities and lexicon metadata
# =========================
def fold(s: str) -> str:
    """Strip combining diacritics (NFD fold) from *s*."""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != "Mn")


def has_diacritic(s: str) -> bool:
    """True if *s* contains an accented Spanish vowel or ü."""
    return bool(re.search(r"[áéíóúüÁÉÍÓÚÜ]", s or ""))


def _canon_pos(p: str) -> str:
    """Canonicalize a POS label from the CSVs ('' if unrecognized)."""
    p = (p or "").strip().upper()
    MAP = {
        "V": "V", "VERB": "V",
        "N": "N", "NOUN": "N",
        "ADJ": "ADJ", "ADJECTIVE": "ADJ",
        "ADV": "ADV", "ADVERB": "ADV",
        "INTJ": "INTJ", "INTERJ": "INTJ", "INTERJECTION": "INTJ",
        "PRON": "PRON", "PRONOUN": "PRON",
        "PART": "PART", "PARTICLE": "PART",
        "POSTP": "POSTP", "ADP": "POSTP", "ADPOSITION": "POSTP"
    }
    return MAP.get(p, "")


def _boolish(x):
    """Parse a CSV truthy/falsy cell; None when indeterminate."""
    if x is None:
        return None
    s = str(x).strip().lower()
    if s in ("1", "true", "t", "yes", "y", "si", "sí"):
        return True
    if s in ("0", "false", "f", "no", "n"):
        return False
    return None


def _meta_set(form_es: str, pos: str = None, tam_ok=None):
    """Record POS / tam_ok metadata for a Spanish form (first writer wins)."""
    if not form_es:
        return
    d = LEX_META.setdefault(form_es, {})
    if pos and not d.get("pos"):
        d["pos"] = pos
    if tam_ok is not None and d.get("tam_ok") is None:
        d["tam_ok"] = bool(tam_ok)


def pos_of_es(token_low: str) -> str:
    """POS of a Spanish token from metadata, else 'V' if it looks verbal."""
    m = LEX_META.get(token_low, {})
    if m.get("pos"):
        return m["pos"]
    # Fallback: if it "looks like" a verb form, say so
    return "V" if looks_like_verb_form_strict(token_low) else ""


def tam_allowed_for_es(token_low: str) -> bool:
    """Whether TAM suffixing is allowed for this Spanish token."""
    m = LEX_META.get(token_low, {})
    if m.get("tam_ok") is not None:
        return bool(m["tam_ok"])
    return pos_of_es(token_low) == "V"


# =========================
# 4) TTS (Meta MMS VITS)
# =========================
print("Cargando modelo de voz de Meta AI (TTS)...")
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = model = None
try:
    # FIX: import inside the guard so a missing/incompatible `transformers`
    # degrades to "no speech" instead of crashing the whole app at import time.
    from transformers import AutoProcessor, VitsModel
    processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa")
    model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
    print("Modelo de voz cargado con éxito.")
except Exception as e:
    print(f"ERROR: No se pudo cargar el modelo de voz. La locución no funcionará. Error: {e}")

PAUSE_LEVEL = 3


def add_reading_pauses(text: str, level: int = 3) -> str:
    """Duplicate punctuation to coax the TTS into longer pauses (level 1 = off)."""
    if level <= 1:
        return text
    t = text
    if level >= 2:
        t = re.sub(r",\s*", ", , ", t)
    if level >= 3:
        t = re.sub(r"\.\s*", ". . ", t)
        t = re.sub(r";\s*", "; ; ", t)
    return re.sub(r"\s+", " ", t).strip()


def hispanize_for_tts(ni_text: str) -> str:
    """Rewrite NI orthography into something the Spanish TTS can pronounce."""
    text = (ni_text or "").lower()
    # 'ŕ' -> 'rr', 'ś' -> 's'. (The original also chained replacements for
    # 'eś', 'ŕa' and 'aŕe', but those were no-ops: the special characters are
    # already gone after the first two replaces, so they were dropped.)
    text = text.replace('ŕ', 'rr').replace('ś', 's')
    text = text.replace('-', ' ')
    text = re.sub(r'\[.*?\]', '', text)   # drop [SIN-LEX:…]-style markers
    text = re.sub(r'\s+', ' ', text).strip()
    return add_reading_pauses(text, PAUSE_LEVEL)


def synthesize_speech(text):
    """Return (sample_rate, waveform) for *text*, or None if TTS is unavailable."""
    if not text or not text.strip() or model is None or processor is None:
        return None
    try:
        inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device)
        with torch.no_grad():
            output = model(**inputs).waveform
        speech_np = output.cpu().numpy().squeeze()
        # Peak-normalize to 0.9 to avoid clipping
        mx = max(abs(speech_np.min()), abs(speech_np.max()))
        if mx > 0:
            speech_np = speech_np / mx * 0.9
        return (16000, speech_np.astype(np.float32))
    except Exception as e:
        print(f"Error durante la síntesis de voz: {e}")
        return None


# =========================
# 5) Iberian line (visual)
# =========================
KEYS_MODE = "explicit"
V = "aeiou"
SYL_FOR = {
    "b": ["‹BA›", "‹BE›", "‹BI›", "‹BO›", "‹BU›"],
    "d": ["‹DA›", "‹DE›", "‹DI›", "‹DO›", "‹DU›"],
    "t": ["‹TA›", "‹TE›", "‹TI›", "‹TO›", "‹TU›"],
    "g": ["‹GA›", "‹GE›", "‹GI›", "‹GO›", "‹GU›"],
    "k": ["‹KA›", "‹KE›", "‹KI›", "‹KO›", "‹KU›"],
}
ALPHA_FOR = {"a": "‹A›", "e": "‹E›", "i": "‹I›", "o": "‹O›", "u": "‹U›",
             "s": "‹S›", "ś": "‹Ś›", "l": "‹L›", "r": "‹R›", "ŕ": "‹Ŕ›",
             "n": "‹N›", "m": "‹M›"}
CODA_FOR = {"": "", "n": "‹N›", "s": "‹S›", "ś": "‹Ś›", "r": "‹R›", "ŕ": "‹Ŕ›",
            "l": "‹L›", "m": "‹M›", "k": "‹K›", "t": "‹T›"}


def tokens_from_latin(ni: str) -> str:
    """Transliterate a latinized NI word into ‹…› signary tokens."""
    out = []
    i = 0
    ni = (ni or "").lower()
    while i < len(ni):
        c = ni[i]
        # FIX (kept from original): the signary has no independent /p/ → write B-
        if c == "p":
            c = "b"
        if c == "-":
            out.append("—")
            i += 1
            continue
        if c in V:
            out.append(ALPHA_FOR[c])
            i += 1
            continue
        if c in SYL_FOR and i + 1 < len(ni) and ni[i + 1] in V:
            idx = V.index(ni[i + 1])
            tok = SYL_FOR[c][idx]
            # NOTE(review): a following coda consonant is absorbed greedily,
            # even when it could be the onset of the next syllable — confirm
            # this syllabification is intended.
            coda = ni[i + 2] if i + 2 < len(ni) else ""
            if coda in CODA_FOR and coda != "":
                tok += CODA_FOR[coda]
                i += 3
            else:
                i += 2
            out.append(tok)
            continue
        out.append(ALPHA_FOR.get(c, c.upper()))
        i += 1
    return "".join(out)


KEYS_OVERRIDE = {"ka": "K", "mi": "MI", "te": "TE", "ne": "N", "o": "O", "eś": "X"}


def georgeos_keys(token_str: str, ni_plain: str) -> str:
    """Collapse ‹…› signary tokens into a compact key string."""
    low = (ni_plain or "").lower()
    if low in KEYS_OVERRIDE:
        return KEYS_OVERRIDE[low]
    m = re.findall(r"‹(.*?)›", token_str)
    out = []
    for t in m:
        if KEYS_MODE == "compact":
            if len(t) == 2 and t[0] in "BDTGK":
                out.append(t[0])
            elif t in ("A", "E", "I", "O", "U"):
                out.append(t)
            elif t == "Ś":
                out.append("X")
            elif t == "Ŕ":
                out.append("r")
            else:
                out.append(t[0].upper())
        else:
            if len(t) == 2 and t[0] in "BDTGK":
                out.append(t)
            elif t == "Ś":
                out.append("X")
            elif t == "Ŕ":
                out.append("r")
            else:
                out.append(t.upper())
    return "".join(out)


TRIDOT = "/"
# FIX (kept from original): include typographic quotes
VISIBLE_PUNCT = set(",.;:…()[]{}\"'«»——""''")
# Strong clause boundaries (duplicate "—" removed from the original literal)
HARD_BOUND = {".", ";", "—", ":", "(", ")", "«", "»", """, """, "'", "'"}


def render_ib_with_tridots(toks):
    """Join Iberian-line tokens, separating adjacent words with the tridot mark."""
    res = []
    prev_word = False
    for tk in toks:
        is_punct = tk in VISIBLE_PUNCT
        if is_punct:
            res.append(" " + tk + " ")
            prev_word = False
        else:
            if prev_word:
                res.append(" " + TRIDOT + " ")
            res.append(tk)
            prev_word = True
    return "".join(res).strip()


# =========================
# 6) IMPROVED ES→NI translator
# =========================
# TAM suffixes, including imperative and (archaic) future subjunctive
TAM_SUFFIX = {"PRS": "-ke", "PST": "-bo", "FUT": "-ta", "IPFV": "-ri",
              "COND": "-ni", "SBJV": "-ni", "IMP": "-tu", "INF": "",
              "FUT_SBJV": "-ra", "UNK": "-ke"}

# Anti-double TAM (prevents -bo-bo, -ni-ri, etc.)
VERB_TAM = ("-ke", "-ta", "-bo", "-ri", "-ni", "-tu", "-ra") def strip_ni_tam(lemma: str): lemma = lemma or "" for s in sorted(VERB_TAM, key=len, reverse=True): if lemma.endswith(s): return lemma[:-len(s)], s return lemma, "" STOP = set( """ el la los las lo un una unos unas al del de en con sin por sobre entre hasta desde hacia según tras pero aunque sino que como si porque cuando donde mientras muy ya sí no también solo sólo aún aun más menos mi mis tu tus su sus nuestro nuestra nuestros nuestras esto eso aquello ese esa esos esas aquel aquella aquellos aquellas quien quién quiénes cual cuál cuales cuáles cuyo cuya cuyos cuyas eh ay oh uy ah aja jeje jaja aah ahh ohh uhh """.split() ) # ======= Modalidad "presunto ibero-vascoide" ======= Q_ENCLITIC_INT = "-na" # ¿ ... ? Q_ENCLITIC_EXC = "-ba" # ¡ ... ! WH_WORDS = { "qué", "quien", "quién", "quienes", "quiénes", "cual", "cuál", "cuales", "cuáles", "donde", "dónde", "cuando", "cuándo", "como", "cómo", "cuanto", "cuánto", "cuanta", "cuánta", "cuantos", "cuántos", "cuantas", "cuántas" } def is_wh_token(t: str) -> bool: low = (t or "").lower() if low in WH_WORDS: return True f = fold(low) return f in {"que", "quien", "quienes", "cual", "cuales", "donde", "cuando", "como", "cuanto", "cuanta", "cuantos", "cuantas"} # FIX: el detector WH ignora WH dentro de paréntesis y comillas tipográficas def has_wh_outside_parens(toks) -> bool: depth = 0 for tk in toks: if tk in {"(", "«", """, "'"}: depth += 1 elif tk in {")", "»", """, "'"}: depth = max(0, depth - 1) elif depth == 0 and is_wh_token(tk): return True return False def normalize_es(text: str) -> str: return re.sub(r"\s+", " ", (text or "").strip()) # FIX: tokenizar también " " ' ' def tokenize_es(text: str): text = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»——""''])", r" \1 ", text) return [t for t in text.split() if t] # Reglas "a" → ka/mi/te (simplificado) def rule_a(prev_tok: str, token: str, next_tok: str) -> str: verbs = {"dar", "decir", "contar", "enviar", "ofrecer", 
"mostrar", "prestar", "regalar", "entregar"} if prev_tok in verbs: return "mi" nombres = {"ana", "marta", "juan", "pedro", "luis", "maría", "jose", "carlos", "laura"} if next_tok in nombres: return "te" return "ka" ESTAR_SET = {"estoy", "estás", "está", "estamos", "estáis", "están", "estaba", "estabas", "estábamos", "estabais", "estaban"} HABER_SET = {"he", "has", "ha", "hemos", "habéis", "han", "había", "habías", "habíamos", "habíais", "habían"} def detect_tam_with_context(toks, i, sentence_start=False): """Versión mejorada con análisis contextual profundo y desambiguación - CORREGIDO""" t = toks[i].lower() prev = toks[i - 1].lower() if i > 0 else "" prev2 = toks[i - 2].lower() if i > 1 else "" prev3 = toks[i - 3].lower() if i > 2 else "" nxt = toks[i + 1].lower() if i + 1 < len(toks) else "" # Detectar contexto de exclamación (buscar ¡ hacia atrás) in_exclamation = False for j in range(max(0, i - 5), i): if toks[j] == "¡": in_exclamation = True break if toks[j] in {".", ";", ":", "?"}: break # === VERIFICACIONES TEMPRANAS (antes de es_morph_tag) === # Estar/Haber en presente (crítico: evitar detección como FUT) if t in ESTAR_SET or t in HABER_SET: return "PRS" # Primero usar detección morfológica mejorada tag = es_morph_tag(t) # "di" - dar(PST) vs decir(IMP) if t == "di": # Contexto imperativo: inicio + exclamación if (sentence_start or prev in {",", ".", "!", "¡", ";", ":"}) and in_exclamation: return "IMP" # Coordinación con otro pretérito: "vi y di", "vino y di" # Buscar hacia atrás hasta 4 tokens for lookback in range(1, min(5, i + 1)): check_tok = toks[i - lookback].lower() if check_tok in {"vi", "vine", "fui", "fue", "hice", "hizo", "dije", "dijo", "tuve", "tuvo", "puse", "puso", "vio", "vino"}: return "PST" # Si encontramos punto/interrogación, parar búsqueda if check_tok in {".", "?", "!", ";", ":"}: break # Después de pronombre sujeto if prev in {"yo", "él", "ella", "usted", "le"}: return "PST" # Después de "y le" o "y me" → pretérito if prev in {"le", 
"me", "te", "les", "nos"} and prev2 in {"y", "e", "pero"}: return "PST" # Por defecto: pretérito (más frecuente) return "PST" # "ve" - ir(IMP) vs ver(IMP) if t == "ve": # Con destino: "ve a casa" → ir if nxt in {"a", "al", "hacia", "hasta", "para"}: return "IMP" # Inicio + exclamación → imperativo if (sentence_start or prev in {",", ".", "!", "¡", ";", ":"}) and in_exclamation: return "IMP" # Por defecto: imperativo return "IMP" # "sé" - ser(IMP) vs saber(PRS) if t == "sé": # Con adjetivo/nombre: "sé bueno", "sé feliz" → ser(IMP) if nxt and not nxt in {"que", "si", "cuando", "donde", "como", "por", "para"}: # Verificar si siguiente es probablemente adjetivo if nxt in {"bueno", "malo", "feliz", "fuerte", "valiente", "honesto", "paciente"}: return "IMP" # Después de "yo" if prev == "yo": return "PRS" # Con subordinada: "sé que..." → saber(PRS) if nxt == "que": return "PRS" # Inicio + exclamación → imperativo if (sentence_start or prev in {",", ".", "!", "¡", ";", ":"}) and in_exclamation: return "IMP" # Por defecto: presente de saber (más frecuente) return "PRS" # === FIN DESAMBIGUACIÓN === # Detectar múltiples clíticos en contexto imperativo if re.search(r"(melo|telo|selo|noslo|oslo)$", t): if sentence_start or prev in {",", ".", "!", "¡", ";", ":"}: return "IMP" # Contextos especiales # Imperativo al inicio de oración o después de puntuación fuerte if i == 0 or prev in {",", ".", "!", "¡", ";", ":"}: if t in {"ven", "haz", "pon", "sal", "ten", "id", "venid", "tened", "salid"}: return "IMP" # Subjuntivo después de ciertos nexos if prev in {"que", "si", "cuando", "aunque", "mientras", "hasta", "para"}: if tag == "SBJV": return "SBJV" # Intentar detectar subjuntivo por contexto si tag es UNK if tag == "UNK" and re.search(r"(e|a)$", t) and not t.endswith(("ar", "er", "ir")): return "SBJV" # Estar + gerundio if prev in ESTAR_SET and RE_GER.search(nxt): return "IPFV" # Haber + participio if prev in HABER_SET and RE_PART.search(nxt): return "PST" # Ir a + infinitivo 
(perífrasis de futuro) if prev == "a" and prev2 in {"voy", "vas", "va", "vamos", "vais", "van"}: if t.endswith(("ar", "er", "ir")): return "FUT" # Si es gerundio o participio por sí mismo if RE_GER.search(t): return "IPFV" if RE_PART.search(t): return "PST" return tag if tag != "UNK" else "PRS" def forced_lemma_with_context(low: str, prev: str, nxt: str) -> str: if low == "visto" and nxt == "de": return "vestir" return "" def has_tilde_equiv_lookup(low: str) -> str: if has_diacritic(low) and not looks_like_verb_form_strict(low): f = fold(low) if f in LEX_FORM: return LEX_FORM[f] if f in FOLD_FORM: return FOLD_FORM[f] return "" def lookup_form_lemma(token: str, prev: str, nxt: str): if not token: return "", False low = token.lower() fl = forced_lemma_with_context(low, prev, nxt) if fl and fl in LEX_LEMMA: return LEX_LEMMA[fl], True if low in LEX_FORM: return LEX_FORM[low], True til = has_tilde_equiv_lookup(low) if til: return til, True if looks_like_verb_form_strict(low): lem = guess_infinitive_es(low) if lem and lem in LEX_LEMMA: return LEX_LEMMA[lem], True return "", False # === Helpers para enclíticos y puntuación === def attach_enclitic(out_words, ib_keys, plain, attach_idx, encl): """Añade -na/-ba al ítem attach_idx, evitando duplicados exactos.""" if attach_idx is None or attach_idx < 0 or attach_idx >= len(out_words): return cur = out_words[attach_idx] or "" if cur.endswith(encl): return out_words[attach_idx] = cur + encl plain[attach_idx] = (plain[attach_idx] or "") + encl ib_keys[attach_idx] = georgeos_keys(tokens_from_latin(plain[attach_idx]), plain[attach_idx]) def ensure_terminal_qmark(out_words, ib_keys, plain): """Si al final se inyectó -na por WH pero no había '?', garantizar que termine en '?'.""" if not out_words: out_words.append("?") ib_keys.append("") plain.append("?") return j = len(out_words) - 1 # saltar tokens vacíos (por si acaso) while j >= 0 and (out_words[j] == "" or out_words[j] is None): j -= 1 if j < 0: out_words.append("?") 
ib_keys.append("") plain.append("?") return if out_words[j] == ".": out_words[j] = "?" ib_keys[j] = "" plain[j] = "?" elif out_words[j] not in {"?", "!"}: out_words.append("?") ib_keys.append("") plain.append("?") # Normaliza superficie si POS no permite TAM def normalize_surface_by_pos(ni_surface: str, pos: str) -> str: if not ni_surface: return ni_surface if pos != "V": root, _ = strip_ni_tam(ni_surface) return root return ni_surface def translate_sentence(sent: str): """Pipeline mejorado con detección contextual y manejo de imperativos - CORREGIDO""" toks = tokenize_es(normalize_es(sent)) out_words = [] # palabras en ni (latín) ib_keys = [] # claves/teclas (línea ibérica) plain = [] # palabra ni "plana" neg_next = False last_finite_idx = None has_qmark = False has_emark = False saw_wh = has_wh_outside_parens(toks) sentence_start = True # Track inicio de oración for i, t in enumerate(toks): # apertura ¿ ¡ if t == "¿" or t == "¡": sentence_start = True # Reset en apertura continue # cierre ? ! if t == "?" or t == "!": if t == "?": has_qmark = True else: has_emark = True encl = Q_ENCLITIC_INT if t == "?" 
else Q_ENCLITIC_EXC attach_idx = last_finite_idx if attach_idx is None: for j in range(len(out_words) - 1, -1, -1): if out_words[j] and out_words[j] not in VISIBLE_PUNCT: attach_idx = j break if attach_idx is not None: attach_enclitic(out_words, ib_keys, plain, attach_idx, encl) out_words.append(t) ib_keys.append("") plain.append(t) sentence_start = True # Reset después de cierre continue # puntuación visible if t in VISIBLE_PUNCT: out_words.append(t) ib_keys.append(t) plain.append(t) # corte de cláusula fuerte → no arrastrar enclítico a la siguiente if t in HARD_BOUND: last_finite_idx = None sentence_start = (t in {".", ":", ";", "—"}) continue # pipeline normal low = t.lower() prev = toks[i - 1].lower() if i > 0 else "" nxt = toks[i + 1].lower() if i + 1 < len(toks) else "" # === DETECCIÓN MEJORADA === # Detectar imperativo por contexto de inicio o por clíticos if ( (sentence_start and t in {"ve", "ven", "haz", "pon", "sal", "di", "ten", "sé", "id", "venid", "tened"}) or (re.search(r"(me|te|lo|la|nos|os|les|se)$", low) and looks_like_verb_form_strict(low)) ): tag_detected = "IMP" else: tag_detected = detect_tam_with_context(toks, i, sentence_start) debug_print(f"Procesando: {t} → morfología detectada: {tag_detected}") pos_hint = pos_of_es(low) is_verb_like = looks_like_verb_form_strict(low) or (pos_hint == "V") tam_ok = tam_allowed_for_es(low) if low == "no": neg_next = True continue if low in {"el", "la", "los", "las", "al", "del"}: continue if low == "a": ni = rule_a(prev, low, nxt) out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) continue if low in {"un", "una"}: ni = "ban" out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) continue if (low in STOP) and (low not in LEX_FORM): continue # 1) Superficie directa desde rich con TAM mejorado ni_direct = SURF_RICH.get((low, tag_detected)) if neg_next and is_verb_like: out_words.append("eś") 
ib_keys.append(georgeos_keys(tokens_from_latin("eś"), "eś")) plain.append("eś") neg_next = False if ni_direct: debug_print(f"Encontrado en SURF_RICH: {low}, {tag_detected} → {ni_direct}") # CORRECCIÓN CRÍTICA: Si el CSV ya tiene el sufijo TAM correcto, usarlo tal cual if any(ni_direct.endswith(s) for s in VERB_TAM): # Ya tiene un sufijo TAM del CSV, confiar en él ni = ni_direct else: # Solo normalizar si NO tiene sufijo TAM ni = normalize_surface_by_pos(ni_direct, "V" if tam_ok else (pos_hint or "")) out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) if tam_ok and any(ni.endswith(s) for s in VERB_TAM): last_finite_idx = len(out_words) - 1 sentence_start = False # Ya no estamos al inicio continue # 2) Diccionario forma/lema con gating TAM ni_lemma, ok = lookup_form_lemma(t, prev, nxt) if ok: # Caso 2.a – forma forzada (imperativos/presentes: conservar superficie, NO re-TAM) if low in FORCE_KEYS: ni = LEX_FORM.get(low, ni_lemma) # superficie fija (p.ej., "ven" -> "nuker-tu") out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 sentence_start = False continue # Caso 2.b – si detectamos imperativo, forzar -tu root, old_suf = strip_ni_tam(ni_lemma or "") if tag_detected == "IMP": ni = root + "-tu" out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 sentence_start = False continue # Si ya trae -tu, preservarlo if old_suf == "-tu": ni = ni_lemma out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 sentence_start = False continue # Caso 2.c – verbo normal: compón TAM si procede if tam_ok and is_verb_like: suf = TAM_SUFFIX.get(tag_detected, "-ke") base = root or (ni_lemma or "") ni = base + suf if suf else base # No añadir sufijo si es INF out_words.append(ni) 
ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) last_finite_idx = len(out_words) - 1 else: ni = normalize_surface_by_pos(ni_lemma if ni_lemma != "" else "Ø", pos_hint or "") out_words.append(ni) ib_keys.append(georgeos_keys(tokens_from_latin(ni), ni)) plain.append(ni) sentence_start = False continue # 3) sin léxico placeholder = f"[SIN-LEX:{t}]" out_words.append(placeholder) ib_keys.append(placeholder) plain.append(placeholder) sentence_start = False # Heurística WH: si hay interrogativa sin ?, añade -na appended_na = False if saw_wh and not has_qmark: encl = Q_ENCLITIC_INT attach_idx = last_finite_idx if attach_idx is None: for j in range(len(out_words) - 1, -1, -1): if ( out_words[j] and out_words[j] not in VISIBLE_PUNCT and out_words[j] not in {"?", "!"} and not out_words[j].startswith("[") ): attach_idx = j break if attach_idx is not None and not ( out_words[attach_idx].endswith("-na") or out_words[attach_idx].endswith("-ba") ): attach_enclitic(out_words, ib_keys, plain, attach_idx, encl) appended_na = True # Si inyectamos -na por WH, garantizar "?" 
    # (WH heuristic, continued) if we injected an interrogative -na but the
    # sentence carried no "?", guarantee a terminal question mark.
    if appended_na and not has_qmark:
        ensure_terminal_qmark(out_words, ib_keys, plain)
    # Drop empty placeholder keys before returning the Iberian-line tokens.
    ib_clean = [k for k in ib_keys if k != ""]
    return " ".join(out_words), ib_clean


def translate(text: str):
    """Translate multi-line Spanish text to Neo-Iberian.

    Each non-empty input line is translated independently through
    translate_sentence(). Returns a pair of newline-joined strings:
    (Latin-script Neo-Iberian, rendered Iberian line).
    """
    lines = [l for l in (text or "").split("\n") if l.strip()]
    ni_lines = []
    ib_lines = []
    for ln in lines:
        ni, ib_toks = translate_sentence(ln)
        ni_lines.append(ni)
        ib_lines.append(render_ib_with_tridots(ib_toks))
    return "\n".join(ni_lines), "\n".join(ib_lines)


# =========================
# 7) Lexicon loading (with POS/tam_ok metadata)
# =========================
def load_lexicon():
    """Populate the global lexicon tables from the candidate CSV files.

    Three schemas are recognised, detected per file from the header row:
      * rich   — source_es/es_morph (+ ni_surface or ni_root+ni_suffix) → SURF_RICH
      * simple — source_es/target_ni → LEX_FORM (+ inferred LEX_LEMMA for verbs)
      * pretty — es/ni_lemma → LEX_FORM (+ inferred LEX_LEMMA for verbs)

    Returns True if at least one CSV was loaded successfully.
    """
    loaded = False
    for p in CSV_CANDIDATES:
        if not os.path.exists(p):
            continue
        try:
            with open(p, encoding="utf-8") as f:
                rd = csv.DictReader(f)
                flds = set(rd.fieldnames or [])
                if {"source_es", "es_morph"}.issubset(flds):  # rich
                    for r in rd:
                        es = (r.get("source_es") or "").strip().lower()
                        tag = (r.get("es_morph") or "").strip().upper()
                        surf = (r.get("ni_surface") or "").strip()
                        if not surf:
                            # No explicit surface form: rebuild it from root+suffix.
                            root = (r.get("ni_root") or "").strip()
                            suf = (r.get("ni_suffix") or "").strip()
                            if root or suf:
                                surf = f"{root}{suf}"
                        if es and tag and surf:
                            SURF_RICH[(es, tag)] = surf
                        ni = (r.get("target_ni") or "").strip()
                        es_lem = (r.get("es_lemma") or "").strip().lower()
                        # POS/tam_ok from the CSV (backwards compatible: several
                        # legacy column names are accepted for the POS field).
                        pos = _canon_pos(
                            r.get("pos") or r.get("es_pos") or r.get("target_pos")
                            or r.get("pos_es") or r.get("ni_pos") or ""
                        )
                        tam_ok = _boolish(r.get("tam_ok"))
                        if es:
                            # Default tam_ok to "is it a verb?" when the CSV is silent.
                            _meta_set(es, pos=pos, tam_ok=(tam_ok if tam_ok is not None else (pos == "V" if pos else None)))
                        if es_lem:
                            _meta_set(
                                es_lem,
                                pos=("V" if es_lem.endswith(("ar", "er", "ir")) else (pos or "")),
                                tam_ok=(tam_ok if tam_ok is not None else (pos == "V" if pos else None)),
                            )
                        if es and ni != "":
                            LEX_FORM.setdefault(es, ni)
                        if es_lem and ni != "":
                            LEX_LEMMA.setdefault(es_lem, ni)
                    loaded = True
                    continue
                if {"source_es", "target_ni"}.issubset(flds):  # simple
                    for r in rd:
                        es = (r.get("source_es") or "").strip().lower()
                        ni = (r.get("target_ni") or "").strip()
                        if not es:
                            continue
                        LEX_FORM.setdefault(es, ni)
                        _meta_set(es, pos="", tam_ok=None)
                        if looks_like_verb_form_strict(es):
                            # Verb-looking form: also index its guessed infinitive.
                            lem = guess_infinitive_es(es)
                            if lem:
                                LEX_LEMMA.setdefault(lem, ni)
                                _meta_set(lem, pos="V", tam_ok=True)
                    loaded = True
                    continue
                if {"es", "ni_lemma"}.issubset(flds):  # pretty dictionary
                    for r in rd:
                        es = (r.get("es") or "").strip().lower()
                        ni = (r.get("ni_lemma") or "").strip()
                        if not es:
                            continue
                        LEX_FORM.setdefault(es, ni)
                        _meta_set(es, pos="", tam_ok=None)
                        if looks_like_verb_form_strict(es):
                            lem = guess_infinitive_es(es)
                            if lem:
                                LEX_LEMMA.setdefault(lem, ni)
                                _meta_set(lem, pos="V", tam_ok=True)
                    loaded = True
                    continue
        except Exception as e:
            print(f"[WARN] No se pudo leer {p}: {e}")

    # Accent-stripped fallback lookup, for non-verbs only.
    global FOLD_FORM
    FOLD_FORM = {}
    for k, v in LEX_FORM.items():
        fk = fold(k)
        if fk != k and len(k) >= 5 and not looks_like_verb_form_strict(k):
            FOLD_FORM.setdefault(fk, v)

    # Minimal attested items plus an extended set of personal pronouns.
    KEEP_MIN = {
        # Particles and determiners
        "y": "ne", "o": "o", "no": "eś", "a": "ka", "para": "ka", "eso": "kok", "tarta": "gatel",
        "el": "", "la": "", "los": "", "las": "",
        "un": "ban", "una": "ban", "unos": "", "unas": "",
        "este": "aŕe", "esta": "aŕe", "estos": "aŕe", "estas": "aŕe",
        # Numerals
        "uno": "ban", "dos": "bi", "tres": "irur", "cuatro": "laur", "cinco": "borste",
        "seis": "śei", "siete": "sisbi", "ocho": "sorse", "nueve": "lauŕbi",
        "diez": "abaŕ", "veinte": "oŕkei",
        # Personal pronouns
        "yo": "ni", "tú": "zu", "él": "nar", "ella": "nar",
        "nosotros": "gu", "nosotras": "gu", "vosotros": "zuek", "vosotras": "zuek",
        "ellos": "narek", "ellas": "narek",
        # Essential connectors
        "que": "ze", "si": "baldin", "cuando": "noiz", "donde": "non", "como": "nola",
        "porque": "zeren", "mientras": "bitarte",
        # Frequent test words
        "versión": "bertsi", "test": "froga", "prueba": "froga", "ejemplo": "adibid",
        "texto": "testu", "palabra": "hitz"
    }
    for k, v in KEEP_MIN.items():
        LEX_FORM.setdefault(k, v)
        if k in {"yo", "tú",
"él", "ella", "nosotros", "nosotras", "vosotros", "vosotras", "ellos", "ellas"}: _meta_set(k, pos="PRON", tam_ok=False) elif k in {"que", "si", "cuando", "donde", "como", "porque", "mientras"}: _meta_set(k, pos="PART", tam_ok=False) else: _meta_set(k, pos=_canon_pos("PART" if k in {"y", "o", "no", "a", "para"} else "DET"), tam_ok=False) # LEXEMAS embebidos (seguridad) BUILTIN_LEMMA = { "venir": "nuker", "llover": "xemmo", "ver": "giŕok", "decir": "siśnesiŕ", "llamar": "lankur", "mostrar": "sunlirket", "andar": "sorsak", "dar": "buś", "enviar": "barmosak", "construir": "giknus", "poder": "giokk", "hacer": "giotael", "querer": "quers", "saber": "suber", "poner": "pusen", "salir": "salku", "ir": "nitus", "ser": "izan", "estar": "egon", "haber": "ukan" } for k, v in BUILTIN_LEMMA.items(): LEX_LEMMA.setdefault(k, v) _meta_set(k, pos="V", tam_ok=True) # Fuerzos de formas (presentes y algunos imperativos irregulares) — prioridad dura FORCE_FORMS = { "voy": "nitus-ke", "vas": "nitus-ke", "va": "nitus-ke", "vamos": "nitus-ke", "vais": "nitus-ke", "van": "nitus-ke", "vengo": "nuker-ke", "vienes": "nuker-ke", "viene": "nuker-ke", "venimos": "nuker-ke", "venís": "nuker-ke", "vienen": "nuker-ke", # Imperativos frecuentes "ven": "nuker-tu", "haz": "giotael-tu", "pon": "pusen-tu", "di": "siśnesir-tu", "sal": "salku-tu", "ten": "giokk-tu", "sé": "suber-tu" # Nota: "ve" es ambiguo (ver/ir); se omite por seguridad. } for form, ni in FORCE_FORMS.items(): LEX_FORM[form] = ni # prioridad dura sobre CSV _meta_set(form, pos="V", tam_ok=True) # Exponer claves de forzados para congelar superficie en el pipeline global FORCE_KEYS FORCE_KEYS = set(FORCE_FORMS.keys()) return loaded _ = load_lexicon() # ========================= # 8) UI + Docs # ========================= LABELS = { "ES": { "title": "Traductor Español → Neoíbero", "subtitle": "Explora el renacimiento de la lengua ibérica antigua con tecnología moderna", "in_label": "✏️ Entrada (Español)", "in_ph": "Escribe aquí. 
Ej.: Veo a Ana y doy pan a Marta.", "out_lat": "📜 Salida: Neoíbero (latín)", "out_ib": "🗿 Línea ibérica", "out_audio": "🔊 Locución (Audio)", "btn": "🔄 Traducir", "combo": "🌍 Idioma (UI + explicación)", "doc_header": "📚 Documentación y Referencia", "acc_titles": [ "🎓 Marco académico y decisiones del neoíbero", "🏛️ Herencia posible del íbero histórico", "🎨 Diseño de la conlang (neoíbero)", "⚙️ Pipeline del traductor (paso a paso)", "🔤 Ortografía, línea ibérica y claves", "❓/❗ Modalidad presunto vascoide (-na / -ba)", "📖 Gramática de referencia (v1.2)", "📚 Bibliografía de base", "🧾 Siglas y glosario" ] }, "EN": { "title": "Spanish → Neo-Iberian Translator", "subtitle": "Explore the revival of the ancient Iberian language with modern tech", "in_label": "✏️ Input (Spanish)", "in_ph": "Type here. E.g., Veo a Ana y doy pan to Marta.", "out_lat": "📜 Output: Neo-Iberian (Latin)", "out_ib": "🗿 Iberian line", "out_audio": "🔊 Speech (Audio)", "btn": "🔄 Translate", "combo": "🌍 Language (UI + docs)", "doc_header": "📚 Documentation & Reference", "acc_titles": [ "🎓 Background & design choices", "🏛️ Possible inheritance from ancient Iberian", "🎨 Conlang design (Neo-Iberian)", "⚙️ Translator pipeline (step by step)", "🔤 Orthography, Iberian line & keys", "❓/❗ 'Vascoid' mood (-na / -ba)", "📖 Reference grammar (v1.2)", "📚 Core references", "🧾 Acronyms & glossary" ] } } DOC_ES_0 = """**Escritura y datos.** Este proyecto recrea un "neoíbero" plausible apoyado en: - El signario nororiental y el alfabeto greco-ibérico (uso **visual**, no paleográfico). - Un léxico experimental (CSVs) con pares ES↔NI y superficies condicionadas por morfología. - Un motor morfológico que reconoce tiempos/asp./modo del español y compone sufijos TAM en NI.""" DOC_ES_1 = """**Qué heredamos (consenso aproximado / modelos típicos):** - Fonotaxis **CV(C)**; ausencia de **/p/** fonémica; *r/ŕ* no inicial. 
- Postposiciones/sufijos nominales: **-k** (pl), **-te** (agentivo), **-ar/-en** (genitivo/origen), **-ka** (dat./loc./dist.), **-i** (acusativo con PN). - Partículas: **ne** 'y', **o** 'o', **eś** 'no'. - Numerales plausibles: *ban, bi, irur, laur, borste, śei, abaŕ (10), oŕkei (20)*. > Implementación **conlang** (hipótesis operativas).""" DOC_ES_2 = """**Diseño de la conlang:** - **TAM verbal**: PRS **-ke**, PST **-bo**, FUT **-ta**, IPFV **-ri**, IMP **-tu**, COND/SBJV **-ni**. - Derivación: verbos (-ke/-ta/-bo/-ri/-ni), adjetivos (-si), nombres (-ar/-en/-tu/-la/-ŕa/-si). - Orden preferente **SOV**.""" DOC_ES_3 = """**Pipeline:** 1) Tokeniza; elimina artículos/contracciones. 2) a → ka/mi/te. 3) CSV rich → **superficie directa**; CSV simple → **lema**. 4) **Gating POS/TAM**: solo verbos reciben TAM; no-verbos purgan TAM si llegara. 5) Negación **eś** antes del primer finito. 6) ¿? / ¡! → enclíticos **-na/-ba** al último verbo finito. 7) WH-sin-¿? → **-na** igualmente. 8) Línea ibérica: solo puntuación visible.""" DOC_ES_4 = """**Ortografía/clave ibérica:** - Línea ibérica por **claves**; modo **explicit** (BA/BE/BI/BO/BU). - Separador de palabra = "/" (tridots). - Atajos: ka→K, mi→MI, te→TE, ne→N, o→O, eś→X.""" DOC_ES_5 = """**Modalidad (-na/-ba):** - **-na** interrogativa; **-ba** exclamativa. - Se adhieren al **último finito**; si no hay, al último constituyente.""" DOC_ES_6 = """**Gramática mínima:** - Verbo: raíz + TAM. - Negación **eś**. 
- Casos productivos en léxico atestiguado: -k, -te, -ka, -ar/-en.""" DOC_ES_7 = """**Bibliografía:** Untermann, de Hoz, Ferrer i Jané, Correa, etc.""" DOC_ES_8 = """**Glosario:** TAM, DOM, SOV, CV(C), CSV, superficie, enclítico…""" DOC = { "ES": [DOC_ES_0, DOC_ES_1, DOC_ES_2, DOC_ES_3, DOC_ES_4, DOC_ES_5, DOC_ES_6, DOC_ES_7, DOC_ES_8], "EN": ["Script & data.", "Possible inheritance.", "Conlang design.", "Pipeline summary.", "Orthography & keys.", "'Vascoid' mood.", "Reference grammar.", "Core references.", "Acronyms & glossary."] } def build_css(): b64 = None if os.path.exists("Iberia-Georgeos.ttf"): with open("Iberia-Georgeos.ttf", "rb") as f: b64 = base64.b64encode(f.read()).decode("ascii") font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')" return f""" @font-face {{ font-family: 'IberiaGeorgeos'; src: {font_src}; font-weight: normal; font-style: normal; }} :root {{ --iberian-clay: #8B4513; --iberian-ochre: #CC7722; --iberian-stone: #5C5C5C; --iberian-sand: #D2B48C; --iberian-rust: #A0522D; --iberian-bronze: #CD7F32; }} .gradio-container {{ background: linear-gradient(135deg, #f4e8d8 0%, #e8d5c4 50%, #d4c4b0 100%) !important; font-family: 'Georgia', 'Times New Roman', serif !important; }} .gradio-container h1, .gradio-container h2, .gradio-container h3 {{ color: var(--iberian-clay) !important; text-shadow: 2px 2px 4px rgba(139,69,19,.15) !important; border-bottom: 3px solid var(--iberian-bronze) !important; padding-bottom: .5rem !important; letter-spacing: .5px !important; }} .gradio-container .gr-group {{ background: linear-gradient(to bottom, #f9f6f0, #ede6dc) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 8px !important; box-shadow: 0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5) !important; padding: 1.5rem !important; margin-bottom: 1.5rem !important; }} .gradio-container .gr-accordion {{ background: linear-gradient(145deg, #ebe3d5, #d9cec0) !important; border: 2px 
solid var(--iberian-rust) !important; border-radius: 6px !important; margin-bottom: .8rem !important; box-shadow: 2px 2px 6px rgba(0,0,0,.15) !important; }} .gradio-container .gr-accordion .label-wrap {{ background: linear-gradient(to right, var(--iberian-ochre), var(--iberian-rust)) !important; color: #fff !important; font-weight: 600 !important; padding: .8rem 1rem !important; border-radius: 4px !important; text-shadow: 1px 1px 2px rgba(0,0,0,.3) !important; }} .gradio-container .gr-textbox textarea, .gradio-container .gr-textbox input {{ background: linear-gradient(to bottom, #faf8f3, #f5f0e8) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 6px !important; color: var(--iberian-stone) !important; font-family: 'Georgia', serif !important; box-shadow: inset 2px 2px 4px rgba(139,69,19,.1) !important; }} .gradio-container .gr-textbox textarea:focus, .gradio-container .gr-textbox input:focus {{ border-color: var(--iberian-bronze) !important; box-shadow: inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3) !important; }} .gradio-container .gr-button.gr-button-primary {{ background: linear-gradient(145deg, var(--iberian-bronze), var(--iberian-rust)) !important; border: 2px solid var(--iberian-clay) !important; color: #fff !important; font-weight: bold !important; text-shadow: 1px 1px 2px rgba(0,0,0,.4) !important; box-shadow: 0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2) !important; border-radius: 8px !important; padding: .8rem 1.5rem !important; transition: all .3s ease !important; }} .gradio-container .gr-button.gr-button-primary:hover {{ background: linear-gradient(145deg, var(--iberian-rust), var(--iberian-bronze)) !important; transform: translateY(-2px) !important; box-shadow: 0 6px 12px rgba(139,69,19,.4) !important; }} .gradio-container .gr-dropdown {{ background: linear-gradient(to bottom, #f5efe8, #ebe3d5) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 6px !important; }} 
.gradio-container .gr-dropdown .wrap {{ background: linear-gradient(to bottom, #f5efe8, #ebe3d5) !important; border: 2px solid var(--iberian-sand) !important; border-radius: 6px !important; }} .ib-line {{ font-family: 'IberiaGeorgeos', monospace, sans-serif !important; font-size: 1.9rem !important; line-height: 2.4rem !important; white-space: pre-wrap !important; background: linear-gradient(135deg, #e8dcc8 0%, #d4c4a8 50%, #c4b098 100%) !important; padding: 24px !important; border-radius: 10px !important; border: 3px solid var(--iberian-rust) !important; border-left: 6px solid var(--iberian-bronze) !important; box-shadow: 0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1) !important; color: var(--iberian-clay) !important; text-shadow: 1px 1px 1px rgba(255,255,255,.3) !important; position: relative !important; }} .ib-line::before {{ content: '' !important; position: absolute !important; top: 0 !important; left: 0 !important; right: 0 !important; bottom: 0 !important; background-image: repeating-linear-gradient(0deg, transparent, transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px) !important; pointer-events: none !important; border-radius: 10px !important; }} @media (max-width: 768px) {{ .ib-line {{ font-size: 1.5rem !important; line-height: 2rem !important; padding: 16px !important; }} .gradio-container .gr-group {{ padding: 1rem !important; }} .gradio-container h1 {{ font-size: 1.8rem !important; }} }} @media (max-width: 480px) {{ .ib-line {{ font-size: 1.3rem !important; line-height: 1.8rem !important; padding: 12px !important; }} .gradio-container h1 {{ font-size: 1.5rem !important; }} }} """ CSS = build_css() with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo: with gr.Group(): title = gr.Markdown(f"# {LABELS['ES']['title']}") subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*") combo = gr.Dropdown(choices=["ES", "EN"], value="ES", label=LABELS["ES"]["combo"]) with gr.Group(): 
doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}") acc_titles = LABELS["ES"]["acc_titles"] with gr.Accordion(acc_titles[0], open=False) as acc1: md1 = gr.Markdown(DOC["ES"][0]) with gr.Accordion(acc_titles[1], open=False) as acc2: md2 = gr.Markdown(DOC["ES"][1]) with gr.Accordion(acc_titles[2], open=False) as acc3: md3 = gr.Markdown(DOC["ES"][2]) with gr.Accordion(acc_titles[3], open=False) as acc4: md4 = gr.Markdown(DOC["ES"][3]) with gr.Accordion(acc_titles[4], open=False) as acc5: md5 = gr.Markdown(DOC["ES"][4]) with gr.Accordion(acc_titles[5], open=False) as acc6: md6 = gr.Markdown(DOC["ES"][5]) with gr.Accordion(acc_titles[6], open=False) as acc7: md7 = gr.Markdown(DOC["ES"][6]) with gr.Accordion(acc_titles[7], open=False) as acc8: md8 = gr.Markdown(DOC["ES"][7]) with gr.Accordion(acc_titles[8], open=False) as acc9: md9 = gr.Markdown(DOC["ES"][8]) with gr.Group(): es_in = gr.Textbox(label=LABELS["ES"]["in_label"], placeholder=LABELS["ES"]["in_ph"], lines=5) btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary") with gr.Row(): with gr.Column(scale=2): ni_out = gr.Textbox(label=LABELS["ES"]["out_lat"], lines=5, interactive=False) loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=False) audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy") with gr.Column(scale=1): ib_out = gr.HTML(label=LABELS["ES"]["out_ib"]) def run_translate_only(txt): latin, ib = translate(txt or "") ib_html = f'
{ib.replace("&","&").replace("<","<").replace(">",">")}
' return latin, ib_html, gr.update(visible=True) btn_tr.click(run_translate_only, es_in, [ni_out, ib_out, loc_btn]) def run_locution_from_out(latin_text): return synthesize_speech(latin_text) loc_btn.click(run_locution_from_out, ni_out, audio_out) def switch_lang(sel): L = LABELS[sel] T = L["acc_titles"] D = DOC[sel] return ( gr.update(value=f"# {L['title']}"), gr.update(value=f"*{L['subtitle']}*"), gr.update(label=L["combo"], value=sel), gr.update(value=f"## {L['doc_header']}"), gr.update(label=T[0]), gr.update(value=D[0]), gr.update(label=T[1]), gr.update(value=D[1]), gr.update(label=T[2]), gr.update(value=D[2]), gr.update(label=T[3]), gr.update(value=D[3]), gr.update(label=T[4]), gr.update(value=D[4]), gr.update(label=T[5]), gr.update(value=D[5]), gr.update(label=T[6]), gr.update(value=D[6]), gr.update(label=T[7]), gr.update(value=D[7]), gr.update(label=T[8]), gr.update(value=D[8]), gr.update(label=L["in_label"], placeholder=L["in_ph"]), gr.update(label=L["out_lat"]), gr.update(label=L["out_ib"]), gr.update(label=L["out_audio"]), gr.update(value=L["btn"]) ) combo.change( switch_lang, combo, [ title, subtitle, combo, doc_header, acc1, md1, acc2, md2, acc3, md3, acc4, md4, acc5, md5, acc6, md6, acc7, md7, acc8, md8, acc9, md9, es_in, ni_out, ib_out, audio_out, btn_tr ] ) if __name__ == "__main__": demo.queue().launch()