File size: 7,844 Bytes
c6dfc57 0099257 c6dfc57 49f3a1b 201f653 be01d59 49f3a1b 0099257 82c1af8 c6dfc57 0099257 a08cc8f 1e0b741 be01d59 1e0b741 a08cc8f 0099257 c6dfc57 0099257 49f3a1b a08cc8f 1dadffc c6dfc57 a08cc8f 49f3a1b 0104888 a08cc8f 1dadffc a08cc8f 0104888 4990c94 1dadffc 20b7c2a 1dadffc c6dfc57 1dadffc d31f1ca c6dfc57 20b7c2a 1dadffc 20b7c2a 1dadffc c6dfc57 d31f1ca 1dadffc d31f1ca 1dadffc 4990c94 1dadffc c6dfc57 a5e2965 49f3a1b be01d59 1e0b741 be01d59 0099257 a08cc8f ee20d5e a08cc8f 0099257 bd36deb 30eee06 bd36deb 1e0b741 2bea969 1e0b741 2bea969 bd36deb 30eee06 bd36deb c9aceda bd36deb c1b30d0 20b7c2a 49f3a1b 0099257 1e0b741 20b7c2a 1e0b741 b1ecb63 20b7c2a b1ecb63 1e0b741 20b7c2a c1b30d0 bd36deb 5406932 0099257 bd36deb 0099257 2bea969 c9aceda 166688e 3b8f083 c9aceda 1e0b741 3b8f083 c971d5f 0099257 c971d5f 0099257 1e0b741 2bea969 c971d5f 0099257 c971d5f 0099257 c971d5f 1e0b741 c971d5f bd36deb 0099257 b1ecb63 c9aceda 49b6b36 c6dfc57 b1ecb63 c9aceda c6dfc57 c9aceda b1ecb63 2bea969 bd36deb 0061ba9 13ef88e 0e2cc06 1f44902 0061ba9 1f44902 bd36deb 1f44902 13ef88e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# app.py – Streamlit app para predição de GO:MF
# ProtBERT / ProtBERT-BFD fine-tuned (melvinalves/FineTune)
# ESM-2 base (facebook/esm2_t33_650M_UR50D)
import os, re, numpy as np, torch, joblib, streamlit as st
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel
from keras.models import load_model
from goatools.obo_parser import GODag
# AUTHENTICATION #
# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
# environment variable; if the variable is unset this raises KeyError at import.
login(os.environ["HF_TOKEN"])
# CONFIG #
# Space repository that download_file() fetches models and data from.
SPACE_ID = "melvinalves/protein_function_prediction"
# Number of highest-scoring GO terms listed per sequence.
TOP_N = 20
# Minimum ensemble score for a GO term to be reported as a hit.
THRESH = 0.37
# Residues per chunk when embedding with the two ProtBERT encoders.
CHUNK_PB = 512
# Residues per chunk when embedding with the ESM-2 encoder.
CHUNK_ESM = 1024
# HF REPOSITORIES
# Fine-tuned encoders are given as (repo_id, subfolder) pairs; embed_seq()
# uses the tuple form to tell them apart from a plain base-model repo id.
FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
FINETUNED_BFD = ("melvinalves/FineTune", "fineTunedProtbertbfd")
BASE_ESM = "facebook/esm2_t33_650M_UR50D"
# HELPERS #
@st.cache_resource
def download_file(path):
    """Fetch a small file (≤1 GB) stored in the Space; return what the Hub client returns.

    ``path`` is the file name inside the Space repository identified by
    ``SPACE_ID``. Callers in this file pass the result on as a file path.
    """
    from huggingface_hub import hf_hub_download

    fetched = hf_hub_download(
        filename=path,
        repo_id=SPACE_ID,
        repo_type="space",
    )
    return fetched
@st.cache_resource
def load_keras(name):
    """Load a Keras model (the MLPs and the stacking model) from ``models/<name>``.

    The model is loaded with ``compile=False``.
    """
    model_file = download_file(f"models/{name}")
    return load_model(model_file, compile=False)
@st.cache_resource
def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
    """Load a tokenizer and an encoder, converting TF weights to PyTorch on the fly.

    Args:
        repo_id: Hub repository holding the encoder weights.
        subfolder: Optional subfolder of ``repo_id`` that contains the weights.
        base_tok: Repository to load the tokenizer from; falls back to
            ``repo_id`` when ``None``.

    Returns:
        ``(tokenizer, model)`` with the model switched to eval mode.
    """
    tokenizer_repo = repo_id if base_tok is None else base_tok
    tok = AutoTokenizer.from_pretrained(tokenizer_repo, do_lower_case=False)

    # The weights are always requested with from_tf=True.
    model_kwargs = {"from_tf": True}
    if subfolder:
        model_kwargs["subfolder"] = subfolder

    mdl = AutoModel.from_pretrained(repo_id, **model_kwargs)
    mdl.eval()
    return tok, mdl
def embed_seq(model_ref, seq, chunk):
    """Return the mean first-token (CLS) embedding of ``seq``.

    Long sequences are cut into consecutive pieces of at most ``chunk``
    residues; each piece is encoded separately and the per-piece vectors are
    averaged.

    Args:
        model_ref: Either a ``(repo_id, subfolder)`` tuple (fine-tuned
            ProtBERT, tokenized with the ``Rostlab/prot_bert`` tokenizer) or a
            plain repo id string (base ESM-2 model).
        seq: Amino-acid sequence string.
        chunk: Maximum number of residues per piece.

    Returns:
        ``np.mean`` over the list of ``last_hidden_state[:, 0, :]`` arrays,
        taken along axis 0.
    """
    if isinstance(model_ref, tuple):
        # Fine-tuned ProtBERT: weights live in a subfolder, tokenizer comes
        # from the base ProtBERT repository.
        repo_id, subf = model_ref
        tok, mdl = load_hf_encoder(
            repo_id, subfolder=subf, base_tok="Rostlab/prot_bert"
        )
    else:
        # Base ESM-2 model: tokenizer and weights share one repository.
        tok, mdl = load_hf_encoder(model_ref)

    cls_vectors = []
    for start in range(0, len(seq), chunk):
        piece = seq[start:start + chunk]
        # Residues are space-separated before tokenization.
        encoded = tok(" ".join(piece), return_tensors="pt", truncation=False)
        with torch.no_grad():
            inputs = {key: tensor.to(mdl.device) for key, tensor in encoded.items()}
            hidden = mdl(**inputs).last_hidden_state
            cls_vectors.append(hidden[:, 0, :].cpu().numpy())
    return np.mean(cls_vectors, axis=0)
@st.cache_resource
def load_go_info():
    """Read ``data/go.obo`` and return ``{term_id: (name, raw definition)}``."""
    obo_file = download_file("data/go.obo")
    # "defn" must be requested explicitly for terms to carry a definition.
    dag = GODag(obo_file, optional_attrs=["defn"])

    info = {}
    for tid, term in dag.items():
        info[tid] = (term.name, term.defn)
    return info
# GO term id -> (name, raw definition), used when rendering results.
GO_INFO = load_go_info()

# MODELS #
# One MLP per encoder plus the stacking model that combines their outputs;
# loaded in the same order as they are named on the left-hand side.
mlp_pb, mlp_bfd, mlp_esm, stacking = (
    load_keras(model_name)
    for model_name in (
        "mlp_protbert.h5",
        "mlp_protbertbfd.h5",
        "mlp_esm2.h5",
        "ensemble_stack.h5",
    )
)

# Label binarizer stored with joblib; its classes_ give the GO id of each
# output column. NOTE(review): joblib.load unpickles the file — only safe
# because the file comes from the app's own Space.
mlb = joblib.load(download_file("data/mlb_597.pkl"))
GO = mlb.classes_
# UI #
# NOTE(review): Streamlit documents set_page_config as needing to run before
# other Streamlit commands; the cached loaders above run first — confirm this
# is fine on the deployed Streamlit version.
st.set_page_config(
    page_title="Predição de Funções Moleculares de Proteínas",
    page_icon="🧬",
    layout="centered",
)

# Page-wide CSS: white background, tighter top padding, smaller textarea font
# and a separator line to the right of the first column.
_PAGE_CSS = """
<style>
body, .stApp { background:#FFFFFF !important; }
.block-container { padding-top:1.5rem; }
textarea { font-size:0.9rem !important; }
div[data-testid="column"]:first-child {
border-right:1px solid #E0E0E0; padding-right:1rem !important;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# The logo is optional: it is shown only when the file exists.
_LOGO_FILE = "logo.png"
if os.path.exists(_LOGO_FILE):
    st.image(_LOGO_FILE, width=180)

st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")

# Raw FASTA text typed by the user and the button that triggers inference.
fasta_input = st.text_area(
    "Insere uma ou mais sequências FASTA:",
    height=300,
)
predict_clicked = st.button("Prever GO terms")
# UTILITÁRIOS #
def parse_fasta_multiple(text):
    """Extract ``[(header, sequence)]`` from FASTA text.

    A record starts at a line whose first non-blank character is ``>``; a
    ``>`` anywhere else (e.g. ``A->B`` inside a description) is kept as part
    of that line instead of starting a new record. Leading sequence lines
    that come before any ``>`` line are accepted as a header-less record.

    Args:
        text: Raw text, possibly containing several FASTA records.

    Returns:
        List of ``(header, sequence)`` tuples in input order. Sequences are
        upper-cased with all whitespace removed; records without any sequence
        are skipped. A record with no header text is named ``Seq_<k>``, where
        ``k`` is its 1-based position in the returned list.
    """
    records = []
    header = None
    chunks = []

    def _flush():
        # split()/join drops spaces, tabs and any other stray whitespace.
        seq = "".join("".join(chunks).split()).upper()
        if seq:
            # Never emit an empty header: callers index into header.split().
            name = header or f"Seq_{len(records) + 1}"
            records.append((name, seq))

    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        if line.startswith(">"):
            # A new record begins: store the one collected so far.
            _flush()
            header = line[1:].strip()
            chunks = []
        else:
            chunks.append(line)
    _flush()
    return records
def clean_definition(defin: str) -> str:
    """Return the human-readable part of a raw GO definition.

    The text inside the first pair of double quotes is returned. When there is
    no quoted text, everything before the first ``[`` is returned instead.
    Either way the result is stripped; a falsy input yields ``""``.
    """
    if defin:
        quoted = re.search(r'"([^"]+)"', defin)
        text = quoted.group(1) if quoted else defin.split("[", 1)[0]
        return text.strip()
    return ""
def go_link(go_id, name=""):
    """Build a Markdown link to the QuickGO page of ``go_id``.

    The link text is ``"<go_id> - <name>"`` when ``name`` is truthy and just
    ``go_id`` otherwise.
    """
    url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
    if name:
        return f"[{go_id} - {name}]({url})"
    return f"[{go_id}]({url})"
# MOSTRAR RESULTADOS #
def _uniprot_accession(header):
    """Return the identifier to look up on UniProt for a FASTA ``header``.

    For headers whose first token has the ``sp|ACCESSION|NAME`` or
    ``tr|ACCESSION|NAME`` form, only the accession field is returned. Any
    other header yields its first whitespace-delimited token, or ``""`` when
    the header is blank.
    """
    tokens = header.split()
    if not tokens:
        return ""
    fields = tokens[0].split("|")
    if len(fields) >= 3 and fields[0] in ("sp", "tr") and fields[1]:
        return fields[1]
    return tokens[0]


def mostrar(header, y_pred):
    """Render the prediction results for one sequence.

    Draws an expander titled with ``header`` that contains a "Visitar UniProt"
    button and two columns: the GO terms whose score is at least ``THRESH``
    (with their cleaned definitions) and the ``TOP_N`` highest-scoring terms.

    Args:
        header: FASTA header of the sequence; comes from user input.
        y_pred: Prediction scores; read as ``y_pred[0][idx]`` with ``idx``
            indexing ``GO`` (presumably shape ``(1, len(GO))`` — confirm
            against the stacking model output).
    """
    from html import escape
    from urllib.parse import quote

    # The header is user-controlled and ends up in raw HTML, so the id is
    # percent-encoded for the URL and the URL is escaped for the attribute.
    pid = quote(_uniprot_accession(header), safe="")
    uniprot = escape(f"https://www.uniprot.org/uniprotkb/{pid}", quote=True)

    with st.expander(header, expanded=True):
        st.markdown(
            f"""
<div style="text-align:right;margin-bottom:0.5rem">
<a href="{uniprot}" target="_blank">
<button style="background:#2b8cbe;border:none;border-radius:4px;
padding:0.35rem 0.8rem;color:#fff;font-size:0.9rem;
cursor:pointer">Visitar UniProt</button>
</a>
</div>
""",
            unsafe_allow_html=True
        )
        col1, col2 = st.columns(2)

        # Column 1: every GO term at or above the threshold.
        with col1:
            st.markdown(f"**GO terms com prob ≥ {THRESH}**")
            hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
            if hits:
                for go_id in hits:
                    name, defin_raw = GO_INFO.get(go_id, ("- sem nome -", ""))
                    defin = clean_definition(defin_raw)
                    st.markdown(f"- {go_link(go_id, name)}")
                    if defin:
                        st.caption(defin)
            else:
                st.code("- nenhum -")

        # Column 2: the TOP_N highest-scoring terms, best first.
        with col2:
            st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
            for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
                go_id = GO[idx]
                name, _ = GO_INFO.get(go_id, ("", ""))
                st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
# INFERENCE #
# Runs only on the rerun triggered by the "Prever GO terms" button.
if predict_clicked:
    fasta_records = parse_fasta_multiple(fasta_input)

    if not fasta_records:
        # Give explicit feedback instead of silently rendering nothing when
        # the text area is empty or holds no usable sequence.
        st.warning(
            "Nenhuma sequência válida encontrada. "
            "Insere pelo menos uma sequência FASTA."
        )

    for header, seq in fasta_records:
        with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
            # One embedding per encoder; both ProtBERT variants share a chunk size.
            emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
            emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
            emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)

            # Per-encoder MLP predictions; only the first 597 ESM columns are kept.
            y_pb = mlp_pb.predict(emb_pb)
            y_bfd = mlp_bfd.predict(emb_bfd)
            y_esm = mlp_esm.predict(emb_esm)[:, :597]

            # The stacking model takes the three predictions side by side.
            y_ens = stacking.predict(np.concatenate([y_pb, y_bfd, y_esm], axis=1))
        mostrar(header, y_ens)
# FULL LIST WITH SEARCH BOX #
with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
    search_term = st.text_input("Filtra GO term ou nome:")
    needle = search_term.strip().lower()

    # Keep the terms whose id or name contains the search text
    # (case-insensitive); an empty search keeps every term.
    go_with_names = ((go_id, GO_INFO.get(go_id, ("", ""))[0]) for go_id in GO)
    filtered_go_terms = [
        (go_id, name)
        for go_id, name in go_with_names
        if needle in go_id.lower() or needle in name.lower()
    ]

    if not filtered_go_terms:
        st.info("Nenhum GO term corresponde ao filtro inserido.")
    else:
        # Spread the matches round-robin over three columns.
        cols = st.columns(3)
        for i, (go_id, name) in enumerate(filtered_go_terms):
            cols[i % 3].markdown(f"- {go_link(go_id, name)}")
|