File size: 7,844 Bytes
c6dfc57
0099257
 
 
 
c6dfc57
 
49f3a1b
201f653
be01d59
49f3a1b
0099257
82c1af8
c6dfc57
0099257
a08cc8f
1e0b741
be01d59
1e0b741
 
a08cc8f
0099257
c6dfc57
 
 
 
0099257
49f3a1b
a08cc8f
1dadffc
c6dfc57
a08cc8f
49f3a1b
0104888
a08cc8f
1dadffc
a08cc8f
0104888
4990c94
1dadffc
20b7c2a
1dadffc
 
c6dfc57
1dadffc
 
 
 
 
d31f1ca
 
 
c6dfc57
20b7c2a
 
1dadffc
 
 
20b7c2a
1dadffc
c6dfc57
d31f1ca
1dadffc
d31f1ca
1dadffc
4990c94
1dadffc
c6dfc57
a5e2965
49f3a1b
be01d59
 
1e0b741
 
be01d59
 
 
 
0099257
a08cc8f
 
 
ee20d5e
a08cc8f
 
 
 
0099257
bd36deb
 
 
30eee06
 
bd36deb
1e0b741
 
 
2bea969
1e0b741
2bea969
bd36deb
30eee06
 
 
bd36deb
c9aceda
 
bd36deb
 
c1b30d0
20b7c2a
 
49f3a1b
0099257
1e0b741
 
20b7c2a
 
1e0b741
b1ecb63
20b7c2a
 
 
b1ecb63
1e0b741
 
 
 
20b7c2a
 
 
 
 
 
 
 
 
 
c1b30d0
bd36deb
5406932
0099257
bd36deb
0099257
2bea969
c9aceda
 
166688e
3b8f083
 
 
c9aceda
 
 
 
1e0b741
3b8f083
 
 
 
 
c971d5f
 
 
0099257
c971d5f
 
 
 
 
0099257
1e0b741
2bea969
c971d5f
 
 
0099257
c971d5f
0099257
c971d5f
 
1e0b741
c971d5f
 
 
bd36deb
0099257
b1ecb63
c9aceda
49b6b36
c6dfc57
 
 
b1ecb63
 
 
c9aceda
c6dfc57
c9aceda
b1ecb63
2bea969
bd36deb
0061ba9
13ef88e
0e2cc06
1f44902
0061ba9
1f44902
 
bd36deb
1f44902
 
 
 
 
 
 
 
 
 
 
13ef88e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#  app.py  –  Streamlit app para predição de GO:MF
#  ProtBERT / ProtBERT-BFD fine-tuned   (melvinalves/FineTune)
#  ESM-2 base                            (facebook/esm2_t33_650M_UR50D)


import os, re, numpy as np, torch, joblib, streamlit as st
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel
from keras.models import load_model
from goatools.obo_parser import GODag

# AUTHENTICATION #
# The token is injected as a Space secret. Skip login gracefully when it is
# absent (e.g. local runs against public repos) instead of crashing with a
# KeyError at import time.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(hf_token)

# CONFIG #
SPACE_ID   = "melvinalves/protein_function_prediction"
TOP_N      = 20          # entries shown in the "most probable" column
THRESH     = 0.37        # probability cut-off for the positive-hit column
CHUNK_PB   = 512         # max residues per ProtBERT forward pass
CHUNK_ESM  = 1024        # max residues per ESM-2 forward pass

# HF REPOSITORIES
FINETUNED_PB   = ("melvinalves/FineTune", "fineTunedProtbert")
FINETUNED_BFD  = ("melvinalves/FineTune", "fineTunedProtbertbfd")
BASE_ESM       = "facebook/esm2_t33_650M_UR50D"

# HELPERS #
@st.cache_resource
def download_file(path):
    """Fetch a small (≤1 GB) file stored inside the Space; return its local path."""
    from huggingface_hub import hf_hub_download
    local_path = hf_hub_download(repo_id=SPACE_ID, repo_type="space", filename=path)
    return local_path

@st.cache_resource
def load_keras(name):
    """Load a Keras model (MLP head or stacking ensemble) from the Space's models/ folder."""
    model_path = download_file(f"models/{name}")
    return load_model(model_path, compile=False)

@st.cache_resource
def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
    """Load tokenizer + encoder; TF weights are converted to PyTorch on the fly.

    base_tok lets a fine-tuned repo reuse the tokenizer of its base model;
    when omitted, the tokenizer is loaded from repo_id itself.
    """
    tok_source = base_tok if base_tok is not None else repo_id
    tok = AutoTokenizer.from_pretrained(tok_source, do_lower_case=False)

    load_kwargs = {"from_tf": True}
    if subfolder:
        load_kwargs["subfolder"] = subfolder
    encoder = AutoModel.from_pretrained(repo_id, **load_kwargs)
    encoder.eval()
    return tok, encoder

def embed_seq(model_ref, seq, chunk):
    """Return the mean CLS embedding of *seq*, splitting long sequences into chunks.

    model_ref : (repo_id, subfolder) tuple for the fine-tuned ProtBERT
                variants, or a plain repo-id string for the base ESM-2 model.
    seq       : amino-acid sequence (single-letter codes).
    chunk     : maximum residues per forward pass; per-chunk CLS vectors
                are averaged with equal weight regardless of chunk length.
    """
    if isinstance(model_ref, tuple):                # fine-tuned ProtBERT variant
        repo_id, subfolder = model_ref
        tok, mdl = load_hf_encoder(repo_id, subfolder=subfolder,
                                   base_tok="Rostlab/prot_bert")
    else:                                           # base ESM-2 model
        tok, mdl = load_hf_encoder(model_ref)

    chunk_vectors = []
    for start in range(0, len(seq), chunk):
        piece = seq[start:start + chunk]
        # ProtBERT's vocab expects residues separated by spaces.
        # NOTE(review): the same space-joined text is also fed to the ESM-2
        # tokenizer — ESM tokenizers generally ignore whitespace, but confirm.
        encoded = tok(" ".join(piece), return_tensors="pt", truncation=False)
        inputs = {key: tensor.to(mdl.device) for key, tensor in encoded.items()}
        with torch.no_grad():
            output = mdl(**inputs)
        # CLS token (position 0) summarises the whole chunk
        chunk_vectors.append(output.last_hidden_state[:, 0, :].cpu().numpy())
    return np.mean(chunk_vectors, axis=0)

@st.cache_resource
def load_go_info():
    """Parse GO.obo and return {go_id: (term name, raw definition string)}."""
    dag = GODag(download_file("data/go.obo"), optional_attrs=["defn"])
    info = {}
    for term_id, term in dag.items():
        info[term_id] = (term.name, term.defn)
    return info

# GO id → (name, raw definition) lookup, built once at startup
GO_INFO = load_go_info()

# MODELS #
# One MLP head per encoder plus the stacking ensemble that combines them
mlp_pb   = load_keras("mlp_protbert.h5")
mlp_bfd  = load_keras("mlp_protbertbfd.h5")
mlp_esm  = load_keras("mlp_esm2.h5")
stacking = load_keras("ensemble_stack.h5")

# Fitted MultiLabelBinarizer; its classes_ are the GO:MF label vocabulary
# (presumably the 597 labels the filename suggests — confirm against training)
mlb      = joblib.load(download_file("data/mlb_597.pkl"))
GO       = mlb.classes_

# UI #
st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
                   page_icon="🧬", layout="centered")

# Global CSS: white background, tighter top padding, divider between columns
st.markdown(
    """
    <style>
        body, .stApp { background:#FFFFFF !important; }
        .block-container { padding-top:1.5rem; }
        textarea { font-size:0.9rem !important; }
        div[data-testid="column"]:first-child {
            border-right:1px solid #E0E0E0; padding-right:1rem !important;
        }
    </style>
    """,
    unsafe_allow_html=True
)

# Optional logo, shown only when bundled next to the app
if os.path.exists("logo.png"):
    st.image("logo.png", width=180)

st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")

# Main inputs: FASTA text box plus the button that triggers inference below
fasta_input     = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
predict_clicked = st.button("Prever GO terms")

# UTILITIES #
def parse_fasta_multiple(text):
    """Extract [(header, sequence)] pairs from FASTA text.

    A leading block without a '>' header is accepted and labelled "Seq_1".
    All whitespace inside sequences (spaces, tabs, line breaks) is removed —
    not just literal spaces as before — and residues are upper-cased.
    Blocks with an empty sequence are skipped.
    """
    records = []
    for i, blk in enumerate(text.strip().split(">")):
        if not blk.strip():
            continue
        lines = blk.strip().splitlines()
        if i:
            header, body = lines[0].strip(), lines[1:]
        else:
            # text before the first '>' (or input with no '>' at all)
            header, body = f"Seq_{i+1}", lines
        # split()/join strips every kind of whitespace, not only " "
        seq = "".join("".join(line.split()) for line in body).upper()
        if seq:
            records.append((header, seq))
    return records

def clean_definition(defin: str) -> str:
    """Return only the text inside the first pair of double quotes.

    Falls back to the text before the first '[' when there are no quotes,
    and to "" for empty/None input.
    """
    if not defin:
        return ""
    match = re.search(r'"([^"]+)"', defin)
    if match is None:
        return defin.split("[", 1)[0].strip()
    return match.group(1).strip()

def go_link(go_id, name=""):
    """Markdown link to the QuickGO page of *go_id*; label includes *name* when given."""
    url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
    label = f"{go_id} - {name}" if name else go_id
    return f"[{label}]({url})"

# SHOW RESULTS #
def mostrar(header, y_pred):
    """Render prediction results for one sequence inside an expander.

    header : FASTA header; its first whitespace-separated token is treated
             as a UniProt accession for the external link (assumed — the
             header format is not validated here).
    y_pred : 2-D probability array from the stacking model (row 0 is used);
             columns follow the mlb.classes_ label ordering.
    """
    pid      = header.split()[0]
    uniprot  = f"https://www.uniprot.org/uniprotkb/{pid}"

    with st.expander(header, expanded=True):
        # Right-aligned button linking out to the UniProt entry
        st.markdown(
            f"""
            <div style="text-align:right;margin-bottom:0.5rem">
              <a href="{uniprot}" target="_blank">
                <button style="background:#2b8cbe;border:none;border-radius:4px;
                               padding:0.35rem 0.8rem;color:#fff;font-size:0.9rem;
                               cursor:pointer">Visitar UniProt</button>
              </a>
            </div>
            """,
            unsafe_allow_html=True
        )

        col1, col2 = st.columns(2)

        # Column 1: every GO term whose probability clears the threshold
        with col1:
            st.markdown(f"**GO terms com prob ≥ {THRESH}**")
            hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
            if hits:
                for go_id in hits:
                    name, defin_raw = GO_INFO.get(go_id, ("- sem nome -", ""))
                    defin = clean_definition(defin_raw)
                    st.markdown(f"- {go_link(go_id, name)}")
                    if defin:
                        st.caption(defin)
            else:
                st.code("- nenhum -")

        # Column 2: top-N terms by probability, regardless of the threshold
        with col2:
            st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
            for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
                go_id = GO[idx]
                name, _ = GO_INFO.get(go_id, ("", ""))
                st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")

# INFERENCE #
if predict_clicked:
    for header, seq in parse_fasta_multiple(fasta_input):
        with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
            # One embedding per encoder; both ProtBERT variants use the
            # same chunk size, ESM-2 uses its own
            emb_pb  = embed_seq(FINETUNED_PB,  seq, CHUNK_PB)
            emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
            emb_esm = embed_seq(BASE_ESM,       seq, CHUNK_ESM)

            y_pb  = mlp_pb.predict(emb_pb)
            y_bfd = mlp_bfd.predict(emb_bfd)
            # NOTE(review): the slice suggests the ESM head emits more than
            # 597 outputs and only the shared labels are kept — confirm
            y_esm = mlp_esm.predict(emb_esm)[:, :597]

            # Stacking ensemble consumes the three probability vectors
            # concatenated side by side
            y_ens = stacking.predict(np.concatenate([y_pb, y_bfd, y_esm], axis=1))

        mostrar(header, y_ens)

# FULL GO-TERM LIST WITH SEARCH BAR #
with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
    search_term = st.text_input("Filtra GO term ou nome:")

    # Normalise the query once, instead of twice per GO term as before.
    query = search_term.strip().lower()

    # Keep (id, name) pairs whose id or name contains the query;
    # an empty query matches everything, so the full list is shown.
    filtered_go_terms = []
    for go_id in GO:
        name, _ = GO_INFO.get(go_id, ("", ""))
        if query in go_id.lower() or query in name.lower():
            filtered_go_terms.append((go_id, name))

    # Lay the matches out over three columns
    if filtered_go_terms:
        cols = st.columns(3)
        for i, (go_id, name) in enumerate(filtered_go_terms):
            cols[i % 3].markdown(f"- {go_link(go_id, name)}")
    else:
        st.info("Nenhum GO term corresponde ao filtro inserido.")