melvinalves committed on
Commit
5406932
·
verified ·
1 Parent(s): 75c1762

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -86
app.py CHANGED
@@ -14,10 +14,10 @@ login(os.environ["HF_TOKEN"])
14
 
15
  # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
- TOP_N = 20 # top-20 mais prováveis
18
  THRESH = 0.37
19
- CHUNK_PB = 512 # janela ProtBERT / ProtBERT-BFD
20
- CHUNK_ESM = 1024 # janela ESM-2
21
 
22
  # repositórios HF
23
  FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
@@ -43,7 +43,7 @@ def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
43
  • repo_id : repositório HF ou caminho local
44
  • subfolder : subpasta onde vivem pesos/config (None se não houver)
45
  • base_tok : repo para o tokenizer (None => usa repo_id)
46
- Converte tf_model.h5 → PyTorch on-the-fly.
47
  """
48
  if base_tok is None:
49
  base_tok = repo_id
@@ -59,7 +59,8 @@ def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
59
  # ---------- extrair embedding ----------
60
  def embed_seq(model_ref, seq, chunk):
61
  """
62
- Retorna embedding CLS médio (divide sequência em chunks se necessário).
 
63
  """
64
  if isinstance(model_ref, tuple): # ProtBERT fine-tuned
65
  repo_id, subf = model_ref
@@ -96,68 +97,28 @@ mlb = joblib.load(download_file("data/mlb_597.pkl"))
96
  GO = mlb.classes_
97
 
98
  # ——————————————————— UI ——————————————————— #
 
99
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
100
  page_icon="🧬", layout="centered")
101
 
102
- # ---------- CSS global ----------
103
  st.markdown(
104
  """
105
  <style>
106
- /* fundo branco + texto preto */
107
- body, .stApp { background-color:#FFFFFF !important; color:#000000 !important; }
108
- /* reduz top padding para o logo caber completo */
109
- .block-container { padding-top:3rem; }
110
-
111
- /* logo centralizado e afastado do topo */
112
- img.logo-top { display:block; margin:0 auto 1.5rem; }
113
-
114
- /* textarea/input brancos */
115
- textarea, input, .stTextArea textarea, .stTextInput input {
116
- background-color:#FFFFFF !important;
117
- color:#000000 !important;
118
- }
119
-
120
- /* botões Streamlit */
121
- .stButton>button {
122
- background:#F8F9FA !important; /* cinza muito claro */
123
- color:#000000 !important;
124
- border:1px solid #007BFF !important;
125
- border-radius:4px;
126
- }
127
- .stButton>button:hover {
128
- background:#007BFF !important;
129
- color:#FFFFFF !important;
130
- }
131
-
132
- /* botão UniProt custom */
133
- .prot-btn {
134
- background:#007BFF; color:#FFFFFF; border:none;
135
- padding:6px 12px; border-radius:4px; cursor:pointer;
136
- }
137
- .prot-btn:hover {
138
- background:#0056B3;
139
- }
140
-
141
- /* tiramos cores de hover vermelhas dos expanders; seta + texto azuis */
142
- .st-expander:focus:not(:active) .streamlit-expanderHeader,
143
- .streamlit-expanderHeader:hover {
144
- color:#007BFF !important;
145
- }
146
-
147
- /* divisória vertical entre colunas */
148
- div[data-testid='column']:nth-of-type(1) {
149
- border-right:1px solid #DDDDDD;
150
- padding-right:1rem;
151
  }
 
 
152
  </style>
153
  """,
154
  unsafe_allow_html=True
155
  )
156
 
157
- # ---------- Logo ----------
158
  LOGO_PATH = "logo.png"
159
  if os.path.exists(LOGO_PATH):
160
- st.markdown(f'<img src="app://{LOGO_PATH}" width="180" class="logo-top">', unsafe_allow_html=True)
161
 
162
  st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
163
 
@@ -167,65 +128,59 @@ predict_clicked = st.button("Prever GO terms")
167
  # ——————————————————— PARSE DE MÚLTIPLAS SEQUÊNCIAS ——————————————————— #
168
  def parse_fasta_multiple(fasta_str):
169
  """
170
- Devolve lista (header, seq). Suporta bloco inicial sem '>'.
 
171
  """
172
  entries, parsed = fasta_str.strip().split(">"), []
173
  for i, entry in enumerate(entries):
174
  if not entry.strip():
175
  continue
176
  lines = entry.strip().splitlines()
177
- if i > 0: # FASTA normal
178
  header = lines[0].strip()
179
- seq = "".join(lines[1:]).replace(" ", "").upper()
180
- else: # sequência sem '>'
181
  header = f"Seq_{i+1}"
182
- seq = "".join(lines).replace(" ", "").upper()
183
  if seq:
184
  parsed.append((header, seq))
185
  return parsed
186
 
187
- # ——————————————————— FUNÇÕES AUX COLUNA/LINKS ——————————————————— #
188
  def go_link(go_id, name=""):
189
- url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
 
190
  label = f"{go_id} — {name}" if name else go_id
191
  return f"[{label}]({url})"
192
 
193
- def prot_url(header):
 
194
  pid = header.split()[0]
195
- return f"https://www.uniprot.org/uniprotkb/{pid}"
196
-
197
- # ——————————————————— MOSTRAR RESULTADOS ——————————————————— #
198
- def mostrar(header, y_pred):
199
- """Expander com coluna-esq (hits) + coluna-dir (Top-20)."""
200
- url = prot_url(header)
201
-
202
- # botão UniProt fora do expander
203
- st.markdown(
204
- f'<a href="{url}" target="_blank">'
205
- f'<button class="prot-btn">🔗 Ver UniProt ({header.split()[0]})</button>'
206
- f'</a>',
207
- unsafe_allow_html=True
208
- )
209
 
210
- with st.expander(header, expanded=True):
 
 
 
211
  col1, col2 = st.columns(2)
212
 
213
- # coluna 1 – hits acima do threshold
214
  with col1:
215
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
216
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
217
  if hits:
218
  for go_id in hits:
219
  name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
220
- defin = re.sub(r'^\\s*\"?(.+?)\"?\\s*(\\[[^\\]]*\\])?\\s*$', r'\\1',
221
  defin or "")
222
- st.markdown(f"- {go_link(go_id, name)}")
223
  if defin:
224
  st.caption(defin)
225
  else:
226
  st.code("— nenhum —")
227
 
228
- # coluna 2 – top-20
229
  with col2:
230
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
231
  for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
@@ -242,23 +197,23 @@ if predict_clicked:
242
 
243
  for header, seq in parsed_seqs:
244
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
245
- # embeddings
246
  emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
247
  emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
248
  emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)
249
 
250
- # predições MLPs
251
  y_pb = mlp_pb.predict(emb_pb)
252
  y_bfd = mlp_bfd.predict(emb_bfd)
253
- y_esm = mlp_esm.predict(emb_esm)[:, :597] # alinhar nº de termos
254
 
255
- # stacking
256
  X = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
257
  y_ens = stacking.predict(X)
258
 
259
- mostrar(header, y_ens)
260
 
261
- # ——————————————————— LISTA COMPLETA DE TERMOS ——————————————————— #
262
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
263
  cols = st.columns(3)
264
  for i, go_id in enumerate(GO):
 
14
 
15
  # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
+ TOP_N = 20 # mostra agora top-20
18
  THRESH = 0.37
19
+ CHUNK_PB = 512 # janela ProtBERT / ProtBERT-BFD
20
+ CHUNK_ESM = 1024 # janela ESM-2
21
 
22
  # repositórios HF
23
  FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
 
43
  • repo_id : repositório HF ou caminho local
44
  • subfolder : subpasta onde vivem pesos/config (None se não houver)
45
  • base_tok : repo para o tokenizer (None => usa repo_id)
46
+ Converte tf_model.h5 → PyTorch on-the-fly (from_tf=True).
47
  """
48
  if base_tok is None:
49
  base_tok = repo_id
 
59
  # ---------- extrair embedding ----------
60
  def embed_seq(model_ref, seq, chunk):
61
  """
62
+ • model_ref = string (modelo base) OU tuple(repo_id, subfolder) (modelo fine-tuned)
63
+ Retorna embedding CLS médio (caso a sequência seja dividida em chunks).
64
  """
65
  if isinstance(model_ref, tuple): # ProtBERT fine-tuned
66
  repo_id, subf = model_ref
 
97
  GO = mlb.classes_
98
 
99
  # ——————————————————— UI ——————————————————— #
100
+ # --- aspecto geral da página
101
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
102
  page_icon="🧬", layout="centered")
103
 
104
+ # CSS: fundo branco sólido + pequenos ajustes
105
  st.markdown(
106
  """
107
  <style>
108
+ body, .stApp {
109
+ background-color: #FFFFFF !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  }
111
+ .block-container { padding-top: 1.5rem; }
112
+ textarea { font-size: 0.9rem !important; }
113
  </style>
114
  """,
115
  unsafe_allow_html=True
116
  )
117
 
118
+ # Logo (coloca logo.png na raiz do Space)
119
  LOGO_PATH = "logo.png"
120
  if os.path.exists(LOGO_PATH):
121
+ st.image(LOGO_PATH, width=180)
122
 
123
  st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
124
 
 
128
  # ——————————————————— PARSE DE MÚLTIPLAS SEQUÊNCIAS ——————————————————— #
129
  def parse_fasta_multiple(fasta_str):
130
  """
131
+ Devolve lista de (header, seq) a partir de texto FASTA possivelmente múltiplo.
132
+ Suporta bloco inicial sem '>'.
133
  """
134
  entries, parsed = fasta_str.strip().split(">"), []
135
  for i, entry in enumerate(entries):
136
  if not entry.strip():
137
  continue
138
  lines = entry.strip().splitlines()
139
+ if i > 0: # bloco típico FASTA
140
  header = lines[0].strip()
141
+ seq = "".join(lines[1:]).replace(" ", "").upper()
142
+ else: # sequência sem '>'
143
  header = f"Seq_{i+1}"
144
+ seq = "".join(lines).replace(" ", "").upper()
145
  if seq:
146
  parsed.append((header, seq))
147
  return parsed
148
 
149
+ # ——————————————————— FUNÇÕES AUXILIARES DE LAYOUT ——————————————————— #
150
  def go_link(go_id, name=""):
151
+ """Cria link para página do GO term (QuickGO)."""
152
+ url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
153
  label = f"{go_id} — {name}" if name else go_id
154
  return f"[{label}]({url})"
155
 
156
+ def prot_link(header):
157
+ """Tenta gerar link para UniProt usando o primeiro token do header."""
158
  pid = header.split()[0]
159
+ url = f"https://www.uniprot.org/uniprotkb/{pid}"
160
+ return f"[{header}]({url})"
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
+ # ——————————————————— FUNÇÃO PRINCIPAL DE RESULTADOS ——————————————————— #
163
+ def mostrar(tag, y_pred):
164
+ """Mostra resultados em duas colunas dentro de um expander."""
165
+ with st.expander(tag, expanded=True):
166
  col1, col2 = st.columns(2)
167
 
168
+ # ——— coluna 1 : termos acima do threshold
169
  with col1:
170
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
171
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
172
  if hits:
173
  for go_id in hits:
174
  name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
175
+ defin = re.sub(r'^\\s*"?(.+?)"?\\s*(\\[[^\\]]*\\])?\\s*$', r'\\1',
176
  defin or "")
177
+ st.markdown(f"- {go_link(go_id, name)} ")
178
  if defin:
179
  st.caption(defin)
180
  else:
181
  st.code("— nenhum —")
182
 
183
+ # ——— coluna 2 : top-N mais prováveis
184
  with col2:
185
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
186
  for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
 
197
 
198
  for header, seq in parsed_seqs:
199
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
200
+ # ———————————— EMBEDDINGS ———————————— #
201
  emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
202
  emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
203
  emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)
204
 
205
+ # ———————————— PREDIÇÕES MLPs ———————————— #
206
  y_pb = mlp_pb.predict(emb_pb)
207
  y_bfd = mlp_bfd.predict(emb_bfd)
208
+ y_esm = mlp_esm.predict(emb_esm)[:, :597] # alinhar nº de termos
209
 
210
+ # ———————————— STACKING ———————————— #
211
  X = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
212
  y_ens = stacking.predict(X)
213
 
214
+ mostrar(prot_link(header), y_ens)
215
 
216
+ # ——————————————————— LISTA COMPLETA DE TERMOS SUPORTADOS ——————————————————— #
217
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
218
  cols = st.columns(3)
219
  for i, go_id in enumerate(GO):