In [15]:
# %%
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import joblib

# Parâmetros
SEQ_FASTA = "MFNVESVERVELCESLLTWIQTFNVDAPCQTAEDLTNGVVMSQVLQKIDPVYFDDNWLNRIKTEVGDNWRLKISNLKKILKGILDYNHEILGQQINDFTLPDVNLIGEHSDAAELGRMLQLILGCAVNCEQKQEYIQAIMMMEESVQHVVMTAIQELMSKESPVSAGHDAYVDLDRQLKKTTEELNEALSAKEEIAQRCHELDMQVAALQEEKSSLLAENQILMERLNQSDSIEDPNSPAGRRHLQLQTQLEQLQEETFRLEAAKDDYRIRCEELEKEISELRQQNDELTTLADEAQSLKDEIDVLRHSSDKVSKLEGQVESYKKKLEDLGDLRRQVKLLEEKNTMYMQNTVSLEEELRKANAARGQLETYKRQVVELQNRLSDESKKADKLDFEYKRLKEKVDGLQKEKDRLRTERDSLKETIEELRCVQAQEGQLTTQGLMPLGSQESSDSLAAEIVTPEIREKLIRLQHENKMLKLNQEDSDNEKIALLQSLLDDANLRKNELETENRLVNQRLLEVQSQVEELQKSLQDQGSKAEDSVLLKKKLEEHLEKLHEANNELQKKRAIIEDLEPRFNNSSLRIEELQEALRKKEEEMKQMEERYKKYLEKAKSVIRTLDPKQNQGAAPEIQALKNQLQERDRLFHSLEKEYEKTKSQRDMEEKYIVSAWYNMGMTLHKKAAEDRLASTGSGQSFLARQRQATSTRRSYPGHVQPATAR" # (mantém a tua sequência completa)
TOP_N = 10
THRESH = 0.37 

# Funções auxiliares
def get_embedding_mean(model_name, seq, chunk):
 tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
 model = AutoModel.from_pretrained(model_name)
 model.eval()

 chunks = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]
 reps = []
 for c in chunks:
 tokens = tokenizer(" ".join(c), return_tensors="pt", truncation=False, padding=False)
 with torch.no_grad():
 reps.append(model(**tokens).last_hidden_state[:, 0, :].squeeze().numpy())
 return np.mean(reps, axis=0, keepdims=True) # shape (1, dim)

# Embeddings
print("A gerar embeddings …")
emb_pb = get_embedding_mean("Rostlab/prot_bert", SEQ_FASTA, 512)
emb_bfd = get_embedding_mean("Rostlab/prot_bert_bfd", SEQ_FASTA, 512)
emb_esm = get_embedding_mean("facebook/esm2_t33_650M_UR50D", SEQ_FASTA, 1024)

# Carregar modelos
mlp_pb = load_model("models/mlp_protbert.h5")
mlp_bfd = load_model("models/mlp_protbertbfd.h5")
mlp_esm = load_model("models/mlp_esm2.h5")
stacking = load_model("models/ensemble_stack.h5")

# Predições dos MLPs base
print("A fazer predições individuais …")
y_pb = mlp_pb.predict(emb_pb)[:, :597]
y_bfd = mlp_bfd.predict(emb_bfd)[:, :597]
y_esm = mlp_esm.predict(emb_esm)[:, :597]

# --- 4. Ensemble (stacking)
X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
y_ens = stacking.predict(X_stack)

# --- 5. Carregar MultiLabelBinarizer
mlb = joblib.load("data/mlb_597.pkl")
GO = mlb.classes_

# --- 6. Função para mostrar resultados
def print_results(name, y_pred):
 print(f"\n {name}")
 # GO terms acima do limiar
 terms = mlb.inverse_transform((y_pred >= THRESH).astype(int))
 print(f" GO terms com prob ≥ {THRESH}:")
 print(" ", terms[0] if terms[0] else "Nenhum")

 # Top-N
 top_idx = np.argsort(-y_pred[0])[:TOP_N]
 print(f" Top {TOP_N} mais prováveis:")
 for i in top_idx:
 print(f" {GO[i]} : {y_pred[0][i]:.4f}")

# Imprimir tudo
print_results("ProtBERT (MLP)", y_pb)
print_results("ProtBERT-BFD (MLP)", y_bfd)
print_results("ESM-2 (MLP)", y_esm)
print_results("Ensemble (Stacking)", y_ens)


A gerar embeddings …


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


A fazer predições individuais …


ValueError: in user code:

 File "C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\engine\training.py", line 2341, in predict_function *
 return step_function(self, iterator)
 File "C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\engine\training.py", line 2327, in step_function **
 outputs = model.distribute_strategy.run(run_step, args=(data,))
 File "C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\engine\training.py", line 2315, in run_step **
 outputs = model.predict_step(data)
 File "C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\engine\training.py", line 2283, in predict_step
 return self(x, training=False)
 File "C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
 raise e.with_traceback(filtered_tb) from None
 File "C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
 raise ValueError(

 ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 1779), found shape=(None, 1791)
