Spaces:

melvinalves
/

protein_function_prediction

Running

App Files Files Community

melvinalves committed on Jun 23

Commit

c971d5f

verified ·

1 Parent(s): 0725542

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -70

app.py CHANGED Viewed

@@ -14,10 +14,10 @@ login(os.environ["HF_TOKEN"])
 # ———————————————————  CONFIG  ——————————————————— #
 SPACE_ID   = "melvinalves/protein_function_prediction"
-TOP_N      = 20      # top-20 mais prováveis
 THRESH     = 0.37
-CHUNK_PB   = 512      # janela ProtBERT / ProtBERT-BFD
-CHUNK_ESM  = 1024     # janela ESM-2
 # repositórios HF
 FINETUNED_PB   = ("melvinalves/FineTune", "fineTunedProtbert")
@@ -41,9 +41,9 @@ def load_keras(name):
 def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
     """
     • repo_id   : repositório HF ou caminho local
-    • subfolder : subpasta dos pesos (None se não houver)
-    • base_tok  : repo do tokenizer (None → usa repo_id)
-    Converte tf_model.h5 → PyTorch on-the-fly (from_tf=True).
     """
     if base_tok is None:
         base_tok = repo_id
@@ -59,8 +59,7 @@ def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
 # ---------- extrair embedding ----------
 def embed_seq(model_ref, seq, chunk):
     """
-    • model_ref = string (modelo base)  OU  tuple(repo_id, subfolder) (modelo fine-tuned)
-    Retorna embedding CLS médio (caso a sequência seja dividida em chunks).
     """
     if isinstance(model_ref, tuple):                # ProtBERT fine-tuned
         repo_id, subf = model_ref
@@ -100,28 +99,65 @@ GO       = mlb.classes_
 st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
                    page_icon="🧬", layout="centered")
-# CSS global : fundo branco, texto preto, textarea branca + traço colunas
 st.markdown(
     """
     <style>
-        body, .stApp { background:#FFFFFF !important; color:#000000 !important; }
-        textarea     { background:#FFFFFF !important; color:#000000 !important;
-                       font-size:0.9rem !important; }
-        /* traço vertical entre as duas colunas (segunda coluna) */
-        div[data-testid="column"]:nth-of-type(3) {
-            border-left:1px solid #e0e0e0;
-            padding-left:1rem;
         }
-        .block-container { padding-top:1.5rem; }
     </style>
     """,
     unsafe_allow_html=True
 )
-# Logo (coloca logo.png na raiz do Space)
 LOGO_PATH = "logo.png"
 if os.path.exists(LOGO_PATH):
-    st.image(LOGO_PATH, width=180)
 st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
@@ -131,67 +167,71 @@ predict_clicked = st.button("Prever GO terms")
 # ———————————————————  PARSE DE MÚLTIPLAS SEQUÊNCIAS  ——————————————————— #
 def parse_fasta_multiple(fasta_str):
     """
-    Devolve lista de (header, seq) a partir de texto FASTA possivelmente múltiplo.
-    Suporta bloco inicial sem '>'.
     """
     entries, parsed = fasta_str.strip().split(">"), []
     for i, entry in enumerate(entries):
         if not entry.strip():
             continue
         lines = entry.strip().splitlines()
-        if i > 0:          # bloco típico FASTA
             header = lines[0].strip()
-            seq = "".join(lines[1:]).replace(" ", "").upper()
-        else:              # sequência sem '>'
             header = f"Seq_{i+1}"
-            seq = "".join(lines).replace(" ", "").upper()
         if seq:
             parsed.append((header, seq))
     return parsed
-# ———————————————————  FUNÇÕES AUXILIARES DE LAYOUT  ——————————————————— #
 def go_link(go_id, name=""):
-    """Cria link para página do GO term (QuickGO)."""
-    url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
     label = f"{go_id} — {name}" if name else go_id
     return f"[{label}]({url})"
-def prot_link(header):
-    """Tenta gerar link para UniProt usando o primeiro token do header."""
     pid = header.split()[0]
-    url = f"https://www.uniprot.org/uniprotkb/{pid}"
-    return f"[{header}]({url})"
-# ———————————————————  FUNÇÃO PRINCIPAL DE RESULTADOS  ——————————————————— #
-def mostrar(tag, y_pred):
-    """Mostra resultados em duas colunas separadas por traço."""
-    # 3 colunas: esquerda | traço (muito estreito) | direita
-    col1, col_mid, col2 = st.columns([1, 0.04, 1])
-    with col1:
-        st.markdown(f"**GO terms com prob ≥ {THRESH}**")
-        hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
-        if hits:
-            for go_id in hits:
-                name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
-                defin = re.sub(r'^\\s*\"?(.+?)\"?\\s*(\\[[^\\]]*\\])?\\s*$', r'\\1',
-                               defin or "")
-                st.markdown(f"- {go_link(go_id, name)}  ")
-                if defin:
-                    st.caption(defin)
-        else:
-            st.code("— nenhum —")
-    # coluna do meio já tem a linha (CSS) — fica vazia
-    with col_mid:
-        st.write("")
-    with col2:
-        st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
-        for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
-            go_id = GO[idx]
-            name, _ = GO_INFO.get(go_id, ("", ""))
-            st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
 # ———————————————————  INFERÊNCIA  ——————————————————— #
 if predict_clicked:
@@ -202,24 +242,23 @@ if predict_clicked:
     for header, seq in parsed_seqs:
         with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
-            # ————————————  EMBEDDINGS  ———————————— #
             emb_pb  = embed_seq(FINETUNED_PB,  seq, CHUNK_PB)
             emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
             emb_esm = embed_seq(BASE_ESM,       seq, CHUNK_ESM)
-            # ————————————  PREDIÇÕES MLPs  ———————————— #
             y_pb  = mlp_pb.predict(emb_pb)
             y_bfd = mlp_bfd.predict(emb_bfd)
-            y_esm = mlp_esm.predict(emb_esm)[:, :597]  # alinhar nº termos
-            # ————————————  STACKING  ———————————— #
             X     = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
             y_ens = stacking.predict(X)
-        st.markdown(f"### {prot_link(header)}", unsafe_allow_html=True)
-        mostrar("", y_ens)
-# ———————————————————  LISTA COMPLETA DE TERMOS SUPORTADOS  ——————————————————— #
 with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
     cols = st.columns(3)
     for i, go_id in enumerate(GO):

 # ———————————————————  CONFIG  ——————————————————— #
 SPACE_ID   = "melvinalves/protein_function_prediction"
+TOP_N      = 20        # top-20 mais prováveis
 THRESH     = 0.37
+CHUNK_PB   = 512       # janela ProtBERT / ProtBERT-BFD
+CHUNK_ESM  = 1024      # janela ESM-2
 # repositórios HF
 FINETUNED_PB   = ("melvinalves/FineTune", "fineTunedProtbert")
 def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
     """
     • repo_id   : repositório HF ou caminho local
+    • subfolder : subpasta onde vivem pesos/config (None se não houver)
+    • base_tok  : repo para o tokenizer      (None => usa repo_id)
+    Converte tf_model.h5 → PyTorch on-the-fly.
     """
     if base_tok is None:
         base_tok = repo_id
 # ---------- extrair embedding ----------
 def embed_seq(model_ref, seq, chunk):
     """
+    Retorna embedding CLS médio (divide sequência em chunks se necessário).
     """
     if isinstance(model_ref, tuple):                # ProtBERT fine-tuned
         repo_id, subf = model_ref
 st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
                    page_icon="🧬", layout="centered")
+# ---------- CSS global ----------
 st.markdown(
     """
     <style>
+        /* fundo branco + texto preto */
+        body, .stApp                        { background-color:#FFFFFF !important; color:#000000 !important; }
+        /* reduz top padding para o logo caber completo */
+        .block-container                    { padding-top:3rem; }
+        /* logo centralizado e afastado do topo */
+        img.logo-top                        { display:block; margin:0 auto 1.5rem; }
+        /* textarea/input brancos */
+        textarea, input, .stTextArea textarea, .stTextInput input {
+            background-color:#FFFFFF !important;
+            color:#000000 !important;
+        }
+        /* botões Streamlit */
+        .stButton>button {
+            background:#F8F9FA !important;          /* cinza muito claro */
+            color:#000000 !important;
+            border:1px solid #007BFF !important;
+            border-radius:4px;
+        }
+        .stButton>button:hover {
+            background:#007BFF !important;
+            color:#FFFFFF !important;
+        }
+        /* botão UniProt custom */
+        .prot-btn {
+            background:#007BFF; color:#FFFFFF; border:none;
+            padding:6px 12px; border-radius:4px; cursor:pointer;
+        }
+        .prot-btn:hover {
+            background:#0056B3;
+        }
+        /* tiramos cores de hover vermelhas dos expanders; seta + texto azuis */
+        .st-expander:focus:not(:active) .streamlit-expanderHeader,
+        .streamlit-expanderHeader:hover {
+            color:#007BFF !important;
+        }
+        /* divisória vertical entre colunas */
+        div[data-testid='column']:nth-of-type(1) {
+            border-right:1px solid #DDDDDD;
+            padding-right:1rem;
         }
     </style>
     """,
     unsafe_allow_html=True
 )
# ---------- Logo ----------
# The logo is expected as "logo.png" in the app's working directory.
LOGO_PATH = "logo.png"
if os.path.exists(LOGO_PATH):
    import base64

    # "app://logo.png" is not a URL a browser can resolve, so the <img> tag
    # never loaded the file. Inline the bytes as a data URI instead; this keeps
    # the custom "logo-top" CSS class (centering) that st.image cannot carry.
    with open(LOGO_PATH, "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode("ascii")
    st.markdown(
        f'<img src="data:image/png;base64,{logo_b64}" width="180" class="logo-top">',
        unsafe_allow_html=True,
    )
 st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
 # ———————————————————  PARSE DE MÚLTIPLAS SEQUÊNCIAS  ——————————————————— #
 def parse_fasta_multiple(fasta_str):
     """
+    Devolve lista (header, seq). Suporta bloco inicial sem '>'.
     """
     entries, parsed = fasta_str.strip().split(">"), []
     for i, entry in enumerate(entries):
         if not entry.strip():
             continue
         lines = entry.strip().splitlines()
+        if i > 0:      # FASTA normal
             header = lines[0].strip()
+            seq    = "".join(lines[1:]).replace(" ", "").upper()
+        else:          # sequência sem '>'
             header = f"Seq_{i+1}"
+            seq    = "".join(lines).replace(" ", "").upper()
         if seq:
             parsed.append((header, seq))
     return parsed
# ———————————————————  COLUMN / LINK HELPERS  ——————————————————— #
def go_link(go_id, name=""):
    """
    Build a Markdown link to the QuickGO page of a GO term.

    The link text is "go_id — name" when a non-empty name is given,
    otherwise just the GO id.
    """
    if name:
        text = f"{go_id} — {name}"
    else:
        text = go_id
    target = "https://www.ebi.ac.uk/QuickGO/term/{}".format(go_id)
    return "[{}]({})".format(text, target)
def prot_url(header):
    """
    Return the UniProtKB URL for the protein named by a FASTA header.

    The identifier is the first whitespace-separated token of the header.
    For UniProt-style tokens ("sp|P12345|NAME_HUMAN" or "tr|...|..."),
    the accession between the pipes is used, since the full token does not
    form a valid entry URL. The identifier is percent-encoded. An empty or
    blank header returns the bare UniProtKB base URL instead of raising.
    """
    from urllib.parse import quote

    base = "https://www.uniprot.org/uniprotkb/"
    tokens = header.split() if header else []
    if not tokens:
        return base
    pid = tokens[0]
    fields = pid.split("|")
    if len(fields) >= 2 and fields[0].lower() in ("sp", "tr") and fields[1]:
        pid = fields[1]
    return base + quote(pid, safe="")
# ———————————————————  SHOW RESULTS  ——————————————————— #
def _clean_definition(defin):
    """
    Strip optional surrounding double quotes and one trailing "[...]" block
    from a GO definition string; None is treated as an empty string.
    """
    # The earlier pattern doubled every backslash inside a raw string
    # (r'\\s' matches a literal backslash), so it never matched and the
    # definition was shown uncleaned.
    return re.sub(r'^\s*"?(.+?)"?\s*(\[[^\]]*\])?\s*$', r'\1', defin or "")


def mostrar(header, y_pred):
    """
    Render the prediction results for one protein.

    Draws a UniProt link button, then an expander titled with the header
    holding two columns: GO terms whose score is ≥ THRESH (left) and the
    TOP_N highest-scoring GO terms (right).

    • header : FASTA header of the sequence (user-supplied text)
    • y_pred : score array; row 0 is read, so it is presumably shaped
               (1, n_terms) — TODO confirm against the stacking model
    """
    import html

    url = prot_url(header)
    tokens = header.split()
    pid_label = tokens[0] if tokens else header
    # UniProt button outside the expander. The header comes from the text
    # area, so escape it before placing it in raw HTML.
    st.markdown(
        f'<a href="{html.escape(url, quote=True)}" target="_blank">'
        f'<button class="prot-btn">🔗 Ver UniProt ({html.escape(pid_label)})</button>'
        f'</a>',
        unsafe_allow_html=True
    )
    with st.expander(header, expanded=True):
        col1, col2 = st.columns(2)
        # column 1 – hits at or above the threshold
        with col1:
            st.markdown(f"**GO terms com prob ≥ {THRESH}**")
            hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
            if hits:
                for go_id in hits:
                    name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
                    defin = _clean_definition(defin)
                    st.markdown(f"- {go_link(go_id, name)}")
                    if defin:
                        st.caption(defin)
            else:
                st.code("— nenhum —")
        # column 2 – TOP_N terms by descending score
        with col2:
            st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
            for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
                go_id = GO[idx]
                name, _ = GO_INFO.get(go_id, ("", ""))
                st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
 # ———————————————————  INFERÊNCIA  ——————————————————— #
 if predict_clicked:
     for header, seq in parsed_seqs:
         with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
+            # embeddings
             emb_pb  = embed_seq(FINETUNED_PB,  seq, CHUNK_PB)
             emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
             emb_esm = embed_seq(BASE_ESM,       seq, CHUNK_ESM)
+            # predições MLPs
             y_pb  = mlp_pb.predict(emb_pb)
             y_bfd = mlp_bfd.predict(emb_bfd)
+            y_esm = mlp_esm.predict(emb_esm)[:, :597]     # alinhar nº de termos
+            # stacking
             X     = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
             y_ens = stacking.predict(X)
+        mostrar(header, y_ens)
+# ———————————————————  LISTA COMPLETA DE TERMOS  ——————————————————— #
 with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
     cols = st.columns(3)
     for i, go_id in enumerate(GO):