melvinalves committed on
Commit
0099257
·
verified ·
1 Parent(s): 7e71d76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -19
app.py CHANGED
@@ -1,30 +1,30 @@
1
- # -------------------------------------------------------------------------------------------------
2
  # app.py – Streamlit app para predição de GO:MF
3
- # • ProtBERT / ProtBERT-BFD fine-tuned (melvinalves/FineTune)
4
- # • ESM-2 base (facebook/esm2_t33_650M_UR50D)
5
- # -------------------------------------------------------------------------------------------------
 
6
  import os, re, numpy as np, torch, joblib, streamlit as st
7
  from huggingface_hub import login
8
  from transformers import AutoTokenizer, AutoModel
9
  from keras.models import load_model
10
  from goatools.obo_parser import GODag
11
 
12
- # ——————————————————— AUTENTICAÇÃO ——————————————————— #
13
  login(os.environ["HF_TOKEN"])
14
 
15
- # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
  TOP_N = 20
18
  THRESH = 0.37
19
  CHUNK_PB = 512
20
  CHUNK_ESM = 1024
21
 
22
- # repositórios HF
23
  FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
24
  FINETUNED_BFD = ("melvinalves/FineTune", "fineTunedProtbertbfd")
25
  BASE_ESM = "facebook/esm2_t33_650M_UR50D"
26
 
27
- # ——————————————————— HELPERS ——————————————————— #
28
  @st.cache_resource
29
  def download_file(path):
30
  """Ficheiros pequenos (≤1 GB) guardados no Space."""
@@ -76,7 +76,7 @@ def load_go_info():
76
 
77
  GO_INFO = load_go_info()
78
 
79
- # ——————————————————— CARGA MODELOS ——————————————————— #
80
  mlp_pb = load_keras("mlp_protbert.h5")
81
  mlp_bfd = load_keras("mlp_protbertbfd.h5")
82
  mlp_esm = load_keras("mlp_esm2.h5")
@@ -85,7 +85,7 @@ stacking = load_keras("ensemble_stack.h5")
85
  mlb = joblib.load(download_file("data/mlb_597.pkl"))
86
  GO = mlb.classes_
87
 
88
- # ——————————————————— UI ——————————————————— #
89
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
90
  page_icon="🧬", layout="centered")
91
 
@@ -111,7 +111,7 @@ st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
111
  fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
112
  predict_clicked = st.button("Prever GO terms")
113
 
114
- # ——————————————————— UTILITÁRIOS ——————————————————— #
115
  def parse_fasta_multiple(text):
116
  """Extrai [(header, seq)] de texto FASTA (bloco inicial sem '>' suportado)."""
117
  out = []
@@ -139,9 +139,9 @@ def clean_definition(defin: str) -> str:
139
 
140
  def go_link(go_id, name=""):
141
  url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
142
- return f"[{go_id} — {name}]({url})" if name else f"[{go_id}]({url})"
143
 
144
- # ——————————————————— MOSTRAR RESULTADOS ——————————————————— #
145
  def mostrar(header, y_pred):
146
  pid = header.split()[0]
147
  uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
@@ -162,21 +162,21 @@ def mostrar(header, y_pred):
162
 
163
  col1, col2 = st.columns(2)
164
 
165
- # --- coluna 1 : ≥ threshold
166
  with col1:
167
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
168
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
169
  if hits:
170
  for go_id in hits:
171
- name, defin_raw = GO_INFO.get(go_id, ("— sem nome —", ""))
172
  defin = clean_definition(defin_raw)
173
  st.markdown(f"- {go_link(go_id, name)}")
174
  if defin:
175
  st.caption(defin)
176
  else:
177
- st.code("— nenhum —")
178
 
179
- # --- coluna 2 : Top-20
180
  with col2:
181
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
182
  for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
@@ -184,7 +184,7 @@ def mostrar(header, y_pred):
184
  name, _ = GO_INFO.get(go_id, ("", ""))
185
  st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
186
 
187
- # ——————————————————— INFERÊNCIA ——————————————————— #
188
  if predict_clicked:
189
  for header, seq in parse_fasta_multiple(fasta_input):
190
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
@@ -200,7 +200,7 @@ if predict_clicked:
200
 
201
  mostrar(header, y_ens)
202
 
203
- # ——————————————————— LISTA COMPLETA ——————————————————— #
204
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
205
  cols = st.columns(3)
206
  for i, go_id in enumerate(GO):
 
 
1
  # app.py – Streamlit app para predição de GO:MF
2
+ # ProtBERT / ProtBERT-BFD fine-tuned (melvinalves/FineTune)
3
+ # ESM-2 base (facebook/esm2_t33_650M_UR50D)
4
+
5
+
6
  import os, re, numpy as np, torch, joblib, streamlit as st
7
  from huggingface_hub import login
8
  from transformers import AutoTokenizer, AutoModel
9
  from keras.models import load_model
10
  from goatools.obo_parser import GODag
11
 
12
+ # AUTENTICAÇÃO #
13
  login(os.environ["HF_TOKEN"])
14
 
15
+ # CONFIG #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
  TOP_N = 20
18
  THRESH = 0.37
19
  CHUNK_PB = 512
20
  CHUNK_ESM = 1024
21
 
22
+ # REPOSITÓRIOS HF
23
  FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
24
  FINETUNED_BFD = ("melvinalves/FineTune", "fineTunedProtbertbfd")
25
  BASE_ESM = "facebook/esm2_t33_650M_UR50D"
26
 
27
+ # HELPERS #
28
  @st.cache_resource
29
  def download_file(path):
30
  """Ficheiros pequenos (≤1 GB) guardados no Space."""
 
76
 
77
  GO_INFO = load_go_info()
78
 
79
+ # MODELOS #
80
  mlp_pb = load_keras("mlp_protbert.h5")
81
  mlp_bfd = load_keras("mlp_protbertbfd.h5")
82
  mlp_esm = load_keras("mlp_esm2.h5")
 
85
  mlb = joblib.load(download_file("data/mlb_597.pkl"))
86
  GO = mlb.classes_
87
 
88
+ # UI #
89
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
90
  page_icon="🧬", layout="centered")
91
 
 
111
  fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
112
  predict_clicked = st.button("Prever GO terms")
113
 
114
+ # UTILITÁRIOS #
115
  def parse_fasta_multiple(text):
116
  """Extrai [(header, seq)] de texto FASTA (bloco inicial sem '>' suportado)."""
117
  out = []
 
139
 
140
  def go_link(go_id, name=""):
141
  url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
142
+ return f"[{go_id} - {name}]({url})" if name else f"[{go_id}]({url})"
143
 
144
+ # MOSTRAR RESULTADOS #
145
  def mostrar(header, y_pred):
146
  pid = header.split()[0]
147
  uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
 
162
 
163
  col1, col2 = st.columns(2)
164
 
165
+ # coluna 1 : ≥ threshold
166
  with col1:
167
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
168
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
169
  if hits:
170
  for go_id in hits:
171
+ name, defin_raw = GO_INFO.get(go_id, ("- sem nome -", ""))
172
  defin = clean_definition(defin_raw)
173
  st.markdown(f"- {go_link(go_id, name)}")
174
  if defin:
175
  st.caption(defin)
176
  else:
177
+ st.code("- nenhum -")
178
 
179
+ # coluna 2 : Top-20
180
  with col2:
181
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
182
  for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
 
184
  name, _ = GO_INFO.get(go_id, ("", ""))
185
  st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
186
 
187
+ # INFERÊNCIA #
188
  if predict_clicked:
189
  for header, seq in parse_fasta_multiple(fasta_input):
190
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
 
200
 
201
  mostrar(header, y_ens)
202
 
203
+ # LISTA COMPLETA #
204
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
205
  cols = st.columns(3)
206
  for i, go_id in enumerate(GO):