Update app.py
Browse files
app.py
CHANGED
|
@@ -1,30 +1,30 @@
|
|
| 1 |
-
# -------------------------------------------------------------------------------------------------
|
| 2 |
# app.py — Streamlit app para predição de GO:MF
|
| 3 |
-
#
|
| 4 |
-
#
|
| 5 |
-
|
|
|
|
| 6 |
import os, re, numpy as np, torch, joblib, streamlit as st
|
| 7 |
from huggingface_hub import login
|
| 8 |
from transformers import AutoTokenizer, AutoModel
|
| 9 |
from keras.models import load_model
|
| 10 |
from goatools.obo_parser import GODag
|
| 11 |
|
| 12 |
-
#
|
| 13 |
login(os.environ["HF_TOKEN"])
|
| 14 |
|
| 15 |
-
#
|
| 16 |
SPACE_ID = "melvinalves/protein_function_prediction"
|
| 17 |
TOP_N = 20
|
| 18 |
THRESH = 0.37
|
| 19 |
CHUNK_PB = 512
|
| 20 |
CHUNK_ESM = 1024
|
| 21 |
|
| 22 |
-
#
|
| 23 |
FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
|
| 24 |
FINETUNED_BFD = ("melvinalves/FineTune", "fineTunedProtbertbfd")
|
| 25 |
BASE_ESM = "facebook/esm2_t33_650M_UR50D"
|
| 26 |
|
| 27 |
-
#
|
| 28 |
@st.cache_resource
|
| 29 |
def download_file(path):
|
| 30 |
"""Ficheiros pequenos (≤1 GB) guardados no Space."""
|
|
@@ -76,7 +76,7 @@ def load_go_info():
|
|
| 76 |
|
| 77 |
GO_INFO = load_go_info()
|
| 78 |
|
| 79 |
-
#
|
| 80 |
mlp_pb = load_keras("mlp_protbert.h5")
|
| 81 |
mlp_bfd = load_keras("mlp_protbertbfd.h5")
|
| 82 |
mlp_esm = load_keras("mlp_esm2.h5")
|
|
@@ -85,7 +85,7 @@ stacking = load_keras("ensemble_stack.h5")
|
|
| 85 |
mlb = joblib.load(download_file("data/mlb_597.pkl"))
|
| 86 |
GO = mlb.classes_
|
| 87 |
|
| 88 |
-
#
|
| 89 |
st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
|
| 90 |
page_icon="🧬", layout="centered")
|
| 91 |
|
|
@@ -111,7 +111,7 @@ st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
|
|
| 111 |
fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
|
| 112 |
predict_clicked = st.button("Prever GO terms")
|
| 113 |
|
| 114 |
-
#
|
| 115 |
def parse_fasta_multiple(text):
|
| 116 |
"""Extrai [(header, seq)] de texto FASTA (bloco inicial sem '>' suportado)."""
|
| 117 |
out = []
|
|
@@ -139,9 +139,9 @@ def clean_definition(defin: str) -> str:
|
|
| 139 |
|
| 140 |
def go_link(go_id, name=""):
|
| 141 |
url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
|
| 142 |
-
return f"[{go_id}
|
| 143 |
|
| 144 |
-
#
|
| 145 |
def mostrar(header, y_pred):
|
| 146 |
pid = header.split()[0]
|
| 147 |
uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
|
|
@@ -162,21 +162,21 @@ def mostrar(header, y_pred):
|
|
| 162 |
|
| 163 |
col1, col2 = st.columns(2)
|
| 164 |
|
| 165 |
-
#
|
| 166 |
with col1:
|
| 167 |
st.markdown(f"**GO terms com prob ≥ {THRESH}**")
|
| 168 |
hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
|
| 169 |
if hits:
|
| 170 |
for go_id in hits:
|
| 171 |
-
name, defin_raw = GO_INFO.get(go_id, ("
|
| 172 |
defin = clean_definition(defin_raw)
|
| 173 |
st.markdown(f"- {go_link(go_id, name)}")
|
| 174 |
if defin:
|
| 175 |
st.caption(defin)
|
| 176 |
else:
|
| 177 |
-
st.code("
|
| 178 |
|
| 179 |
-
#
|
| 180 |
with col2:
|
| 181 |
st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
|
| 182 |
for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
|
|
@@ -184,7 +184,7 @@ def mostrar(header, y_pred):
|
|
| 184 |
name, _ = GO_INFO.get(go_id, ("", ""))
|
| 185 |
st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
|
| 186 |
|
| 187 |
-
#
|
| 188 |
if predict_clicked:
|
| 189 |
for header, seq in parse_fasta_multiple(fasta_input):
|
| 190 |
with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
|
|
@@ -200,7 +200,7 @@ if predict_clicked:
|
|
| 200 |
|
| 201 |
mostrar(header, y_ens)
|
| 202 |
|
| 203 |
-
#
|
| 204 |
with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
|
| 205 |
cols = st.columns(3)
|
| 206 |
for i, go_id in enumerate(GO):
|
|
|
|
|
|
|
| 1 |
# app.py — Streamlit app para predição de GO:MF
|
| 2 |
+
# ProtBERT / ProtBERT-BFD fine-tuned (melvinalves/FineTune)
|
| 3 |
+
# ESM-2 base (facebook/esm2_t33_650M_UR50D)
|
| 4 |
+
|
| 5 |
+
|
| 6 |
import os, re, numpy as np, torch, joblib, streamlit as st
|
| 7 |
from huggingface_hub import login
|
| 8 |
from transformers import AutoTokenizer, AutoModel
|
| 9 |
from keras.models import load_model
|
| 10 |
from goatools.obo_parser import GODag
|
| 11 |
|
| 12 |
+
# AUTENTICAÇÃO #
|
| 13 |
login(os.environ["HF_TOKEN"])
|
| 14 |
|
| 15 |
+
# CONFIG #
|
| 16 |
SPACE_ID = "melvinalves/protein_function_prediction"
|
| 17 |
TOP_N = 20
|
| 18 |
THRESH = 0.37
|
| 19 |
CHUNK_PB = 512
|
| 20 |
CHUNK_ESM = 1024
|
| 21 |
|
| 22 |
+
# REPOSITÓRIOS HF
|
| 23 |
FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
|
| 24 |
FINETUNED_BFD = ("melvinalves/FineTune", "fineTunedProtbertbfd")
|
| 25 |
BASE_ESM = "facebook/esm2_t33_650M_UR50D"
|
| 26 |
|
| 27 |
+
# HELPERS #
|
| 28 |
@st.cache_resource
|
| 29 |
def download_file(path):
|
| 30 |
"""Ficheiros pequenos (≤1 GB) guardados no Space."""
|
|
|
|
| 76 |
|
| 77 |
GO_INFO = load_go_info()
|
| 78 |
|
| 79 |
+
# MODELOS #
|
| 80 |
mlp_pb = load_keras("mlp_protbert.h5")
|
| 81 |
mlp_bfd = load_keras("mlp_protbertbfd.h5")
|
| 82 |
mlp_esm = load_keras("mlp_esm2.h5")
|
|
|
|
| 85 |
mlb = joblib.load(download_file("data/mlb_597.pkl"))
|
| 86 |
GO = mlb.classes_
|
| 87 |
|
| 88 |
+
# UI #
|
| 89 |
st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
|
| 90 |
page_icon="🧬", layout="centered")
|
| 91 |
|
|
|
|
| 111 |
fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
|
| 112 |
predict_clicked = st.button("Prever GO terms")
|
| 113 |
|
| 114 |
+
# UTILITÁRIOS #
|
| 115 |
def parse_fasta_multiple(text):
|
| 116 |
"""Extrai [(header, seq)] de texto FASTA (bloco inicial sem '>' suportado)."""
|
| 117 |
out = []
|
|
|
|
| 139 |
|
| 140 |
def go_link(go_id, name=""):
|
| 141 |
url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
|
| 142 |
+
return f"[{go_id} - {name}]({url})" if name else f"[{go_id}]({url})"
|
| 143 |
|
| 144 |
+
# MOSTRAR RESULTADOS #
|
| 145 |
def mostrar(header, y_pred):
|
| 146 |
pid = header.split()[0]
|
| 147 |
uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
|
|
|
|
| 162 |
|
| 163 |
col1, col2 = st.columns(2)
|
| 164 |
|
| 165 |
+
# coluna 1 : ≥ threshold
|
| 166 |
with col1:
|
| 167 |
st.markdown(f"**GO terms com prob ≥ {THRESH}**")
|
| 168 |
hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
|
| 169 |
if hits:
|
| 170 |
for go_id in hits:
|
| 171 |
+
name, defin_raw = GO_INFO.get(go_id, ("- sem nome -", ""))
|
| 172 |
defin = clean_definition(defin_raw)
|
| 173 |
st.markdown(f"- {go_link(go_id, name)}")
|
| 174 |
if defin:
|
| 175 |
st.caption(defin)
|
| 176 |
else:
|
| 177 |
+
st.code("- nenhum -")
|
| 178 |
|
| 179 |
+
# coluna 2 : Top-20
|
| 180 |
with col2:
|
| 181 |
st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
|
| 182 |
for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
|
|
|
|
| 184 |
name, _ = GO_INFO.get(go_id, ("", ""))
|
| 185 |
st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
|
| 186 |
|
| 187 |
+
# INFERÊNCIA #
|
| 188 |
if predict_clicked:
|
| 189 |
for header, seq in parse_fasta_multiple(fasta_input):
|
| 190 |
with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
|
|
|
|
| 200 |
|
| 201 |
mostrar(header, y_ens)
|
| 202 |
|
| 203 |
+
# LISTA COMPLETA #
|
| 204 |
with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
|
| 205 |
cols = st.columns(3)
|
| 206 |
for i, go_id in enumerate(GO):
|