In [9]:
# %%
import pandas as pd
import numpy as np
from Bio import SeqIO
from goatools.obo_parser import GODag
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import os, random

# Carregar ficheiros principais
FASTA = "uniprot_sprot_exp.fasta"
ANNOT = "uniprot_sprot_exp.txt"
GO_OBO = "go.obo"

# Ler sequências
seqs, ids = [], []
for record in SeqIO.parse(FASTA, "fasta"):
 ids.append(record.id)
 seqs.append(str(record.seq))

df_seq = pd.DataFrame({"protein_id": ids, "sequence": seqs})

# Ler anotações GO:MF
df_ann = pd.read_csv(ANNOT, sep="\t", names=["protein_id", "go_term", "category"])
df_ann = df_ann[df_ann["category"] == "F"]

# Propagação hierárquica dos GO terms
go_dag = GODag(GO_OBO)
mf_terms = {t for t, o in go_dag.items() if o.namespace == "molecular_function"}

def propagate_terms(terms):
 expanded = set()
 for t in terms:
 if t in go_dag:
 expanded |= go_dag[t].get_all_parents()
 expanded.add(t)
 return list(expanded & mf_terms)

grouped = df_ann.groupby("protein_id")["go_term"].apply(list).reset_index()
grouped["go_term"] = grouped["go_term"].apply(propagate_terms)

# Juntar com sequência
df = df_seq.merge(grouped, on="protein_id")
df = df[df["go_term"].str.len() > 0]

# Filtrar GO terms com ≥50 proteínas
all_terms = [term for sublist in df["go_term"] for term in sublist]
term_counts = Counter(all_terms)
valid_terms = {t for t, count in term_counts.items() if count >= 50}

df["go_term"] = df["go_term"].apply(lambda ts: [t for t in ts if t in valid_terms])
df = df[df["go_term"].str.len() > 0]

# Preparar labels e dividir por proteína
df["go_terms"] = df["go_term"].apply(lambda x: ';'.join(sorted(set(x))))
df = df[["protein_id", "sequence", "go_terms"]].drop_duplicates()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["go_terms"].str.split(";"))
X = df[["protein_id", "sequence"]].values

mskf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)
train_idx, temp_idx = next(mskf.split(X, Y))
val_idx, test_idx = np.array_split(temp_idx, 2)

df_train = df.iloc[train_idx].copy()
df_val = df.iloc[val_idx].copy()
df_test = df.iloc[test_idx].copy()

os.makedirs("data", exist_ok=True)
df_train.to_csv("data/mf-training.csv", index=False)
df_val.to_csv("data/mf-validation.csv", index=False)
df_test.to_csv("data/mf-test.csv", index=False)

# Guardar o binarizador
import joblib
joblib.dump(mlb, "data/mlb.pkl")

print("✓ Dataset preparado:")
print(" - Training:", df_train.shape)
print(" - Validation:", df_val.shape)
print(" - Test:", df_test.shape)
print(" - GO terms:", len(mlb.classes_))


go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
✓ Dataset preparado:
 - Training: (31142, 3)
 - Validation: (1724, 3)
 - Test: (1724, 3)
 - GO terms: 602


In [10]:
# %%
import random
from collections import defaultdict

# PAM1
# PAM matrix model of protein evolution
# DOI:10.1093/oxfordjournals.molbev.a040360
pam_data = {
 'A': [9948, 19, 27, 42, 31, 46, 50, 92, 17, 7, 40, 88, 42, 41, 122, 279, 255, 9, 72, 723],
 'R': [14, 9871, 24, 38, 37, 130, 38, 62, 49, 4, 58, 205, 26, 33, 47, 103, 104, 5, 36, 52],
 'N': [20, 22, 9860, 181, 29, 36, 41, 67, 31, 5, 22, 49, 23, 10, 33, 83, 66, 3, 43, 32],
 'D': [40, 34, 187, 9818, 11, 63, 98, 61, 23, 5, 25, 54, 43, 13, 27, 88, 55, 4, 29, 36],
 'C': [20, 16, 26, 9, 9987, 10, 17, 37, 12, 2, 16, 26, 10, 19, 27, 26, 25, 2, 6, 67],
 'Q': [29, 118, 29, 49, 8, 9816, 72, 55, 36, 4, 60, 158, 35, 22, 39, 86, 74, 3, 34, 28],
 'E': [35, 29, 41, 101, 12, 71, 9804, 56, 33, 5, 36, 107, 42, 20, 38, 87, 69, 4, 30, 42],
 'G': [96, 61, 77, 70, 38, 51, 58, 9868, 26, 6, 37, 53, 39, 28, 69, 134, 116, 5, 47, 60],
 'H': [17, 53, 33, 19, 15, 39, 34, 24, 9907, 3, 32, 57, 24, 15, 27, 47, 43, 2, 22, 19],
 'I': [6, 3, 6, 6, 3, 5, 6, 7, 3, 9973, 23, 13, 12, 41, 93, 84, 115, 3, 8, 102],
 'L': [26, 39, 17, 15, 7, 33, 22, 20, 19, 27, 9864, 49, 24, 78, 117, 148, 193, 5, 24, 70],
 'K': [60, 198, 43, 52, 12, 142, 96, 53, 42, 10, 63, 9710, 33, 26, 54, 109, 102, 5, 43, 42],
 'M': [21, 22, 15, 18, 6, 20, 18, 18, 17, 11, 27, 32, 9945, 26, 34, 61, 71, 3, 12, 31],
 'F': [18, 17, 8, 6, 8, 11, 10, 16, 10, 44, 92, 24, 29, 9899, 89, 88, 142, 7, 14, 68],
 'P': [97, 47, 35, 29, 23, 35, 38, 57, 21, 24, 47, 56, 28, 76, 9785, 115, 77, 4, 24, 35],
 'S': [241, 87, 76, 73, 17, 56, 60, 99, 32, 13, 69, 92, 42, 67, 100, 9605, 212, 8, 63, 70],
 'T': [186, 78, 54, 37, 14, 42, 42, 83, 28, 23, 84, 85, 53, 93, 66, 182, 9676, 8, 39, 90],
 'W': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 5, 3, 4, 4, 9960, 3, 4],
 'Y': [29, 21, 17, 9, 4, 13, 9, 21, 10, 7, 20, 17, 11, 23, 19, 41, 31, 3, 9935, 23],
 'V': [368, 27, 18, 18, 50, 23, 34, 64, 15, 85, 72, 42, 33, 88, 42, 112, 137, 4, 20, 9514]
}

pam_raw = pd.DataFrame(pam_data, index=pam_data.keys())
pam_matrix = pam_raw.div(pam_raw.sum(axis=1), axis=0)
pam_dict = {aa: pam_matrix.loc[aa].to_dict() for aa in pam_matrix.index}

def pam1_substitution(aa):
 if aa not in pam_dict:
 return aa
 subs = list(pam_dict[aa].keys())
 probs = list(pam_dict[aa].values())
 return np.random.choice(subs, p=probs)

def augment_sequence(seq, sub_prob=0.05):
 return ''.join([pam1_substitution(aa) if random.random() < sub_prob else aa for aa in seq])

def slice_sequence(seq, win=1024):
 if len(seq) <= win:
 return [seq]
 return [seq[i:i+win] for i in range(0, len(seq), win)]

def format_seq(seq):
 return " ".join(seq)

# Carregar labels e datasets
import joblib
mlb = joblib.load("data/mlb.pkl")
df_train = pd.read_csv("data/mf-training.csv")
df_val = pd.read_csv("data/mf-validation.csv")
df_test = pd.read_csv("data/mf-test.csv")

# Slicing + augmentação no treino
X_train, y_train = [], []

for _, row in df_train.iterrows():
 seq_aug = augment_sequence(row["sequence"], sub_prob=0.05)
 slices = slice_sequence(seq_aug, win=1024)
 label = mlb.transform([row["go_terms"].split(";")])[0]
 for sl in slices:
 X_train.append(format_seq(sl))
 y_train.append(label)

# Sem slicing no val/test
X_val = [format_seq(seq) for seq in df_val["sequence"]]
X_test = [format_seq(seq) for seq in df_test["sequence"]]

y_val = mlb.transform(df_val["go_terms"].str.split(";"))
y_test = mlb.transform(df_test["go_terms"].str.split(";"))

np.save("embeddings/y_test.npy", y_test)

In [11]:
# %%
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import numpy as np
import os

# Configurações
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CHUNK_SIZE = 16

# Carregar modelo
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
model = AutoModel.from_pretrained(MODEL_NAME)
model.to(DEVICE)
model.eval()

def extract_embeddings(texts):
 embeddings = []
 for i in tqdm(range(0, len(texts), CHUNK_SIZE)):
 batch = texts[i:i+CHUNK_SIZE]
 with torch.no_grad():
 inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=1024)
 inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
 outputs = model(**inputs).last_hidden_state
 cls_tokens = outputs[:, 0, :] # token CLS
 embeddings.append(cls_tokens.cpu().numpy())
 return np.vstack(embeddings)

# Extrair e guardar embeddings
os.makedirs("embeddings", exist_ok=True)

emb_train = extract_embeddings(X_train)
emb_val = extract_embeddings(X_val)
emb_test = extract_embeddings(X_test)

np.save("embeddings/esm2_train.npy", emb_train)
np.save("embeddings/esm2_val.npy", emb_val)
np.save("embeddings/esm2_test.npy", emb_test)

np.save("embeddings/y_train.npy", np.array(y_train))
np.save("embeddings/y_val.npy", np.array(y_val))


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2189/2189 [1:17:26<00:00, 2.12s/it]
100%|██████████| 108/108 [03:43<00:00, 2.07s/it]
100%|██████████| 108/108 [03:56<00:00, 2.19s/it]


In [2]:
# %%
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import average_precision_score

# Carregar os embeddings e labels
X_train = np.load("embeddings/esm2_train.npy")
X_val = np.load("embeddings/esm2_val.npy")
X_test = np.load("embeddings/esm2_test.npy")

y_train = np.load("embeddings/y_train.npy")
y_val = np.load("embeddings/y_val.npy")
y_test = np.load("embeddings/y_test.npy")

# Definir o modelo
model = Sequential([
 Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
 Dropout(0.3),
 Dense(512, activation='relu'),
 Dropout(0.3),
 Dense(y_train.shape[1], activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy')

# Treinar
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
 X_train, y_train,
 validation_data=(X_val, y_val),
 epochs=100,
 batch_size=32,
 callbacks=[early_stop],
 verbose=1
)

# Salvar o modelo
model.save("models/mlp_esm2.h5")
model.save("models/mlp_esm2.keras")
print("Modelo guardado em models/")

# Fazer predições no conjunto de teste
y_prob = model.predict(X_test)
np.save("predictions/mf-esm2.npy", y_prob)

print(" Predições do ESM-2 salvas com forma:", y_prob.shape)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Modelo guardado em models/
 Predições do ESM-2 salvas com forma: (1724, 602)


In [3]:
# %%
import numpy as np
import joblib
import math
from goatools.obo_parser import GODag
from sklearn.metrics import precision_recall_curve, auc

# Carregar dados e parâmetros
GO_FILE = "go.obo"
THRESHOLDS = np.arange(0.0, 1.01, 0.01)
ALPHA = 0.5

mlb = joblib.load("data/mlb.pkl")
y_true = np.load("embeddings/y_test.npy")
y_prob = np.load("predictions/mf-esm2.npy")
terms = mlb.classes_
go_dag = GODag(GO_FILE)

print(f"✓ Dados carregados: {y_true.shape} proteínas × {len(terms)} GO terms")

# Fmax
def compute_fmax(y_true, y_prob, thresholds):
 fmax, best_thr = 0, 0
 for t in thresholds:
 y_pred = (y_prob >= t).astype(int)
 tp = (y_true * y_pred).sum(axis=1)
 fp = ((1 - y_true) * y_pred).sum(axis=1)
 fn = (y_true * (1 - y_pred)).sum(axis=1)
 precision = tp / (tp + fp + 1e-8)
 recall = tp / (tp + fn + 1e-8)
 f1 = 2 * precision * recall / (precision + recall + 1e-8)
 avg_f1 = np.mean(f1)
 if avg_f1 > fmax:
 fmax, best_thr = avg_f1, t
 return fmax, best_thr

# AuPRC (micro)
def compute_auprc(y_true, y_prob):
 precision, recall, _ = precision_recall_curve(y_true.ravel(), y_prob.ravel())
 return auc(recall, precision)

# Smin
def compute_smin(y_true, y_prob, terms, threshold, go_dag, alpha=ALPHA):
 y_pred = (y_prob >= threshold).astype(int)

 # Informação semântica: IC (Information Content)
 ic = {}
 total = (y_true + y_pred).sum(axis=0).sum()
 for i, term in enumerate(terms):
 freq = (y_true[:, i] + y_pred[:, i]).sum()
 ic[term] = -np.log((freq + 1e-8) / total)

 # Para cada proteína, calcular RU e MI
 s_values = []
 for true_vec, pred_vec in zip(y_true, y_pred):
 true_terms = {terms[i] for i in np.where(true_vec)[0]}
 pred_terms = {terms[i] for i in np.where(pred_vec)[0]}

 anc_true = set()
 for t in true_terms:
 if t in go_dag:
 anc_true |= go_dag[t].get_all_parents()
 anc_pred = set()
 for t in pred_terms:
 if t in go_dag:
 anc_pred |= go_dag[t].get_all_parents()

 ru = pred_terms - true_terms
 mi = true_terms - pred_terms
 dist_ru = sum(ic.get(t, 0) for t in ru)
 dist_mi = sum(ic.get(t, 0) for t in mi)
 s = math.sqrt((alpha * dist_ru)**2 + ((1 - alpha) * dist_mi)**2)
 s_values.append(s)

 return np.mean(s_values)

# Avaliação
fmax, thr = compute_fmax(y_true, y_prob, THRESHOLDS)
auprc = compute_auprc(y_true, y_prob)
smin = compute_smin(y_true, y_prob, terms, thr, go_dag)

print(f"\n Resultados finais (ESM-2 + PAM1 + propagação):")
print(f"Fmax = {fmax:.4f}")
print(f"Thr. = {thr:.2f}")
print(f"AuPRC = {auprc:.4f}")
print(f"Smin = {smin:.4f}")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
✓ Dados carregados: (1724, 602) proteínas × 602 GO terms

 Resultados finais (ESM-2 + PAM1 + propagação):
Fmax = 0.6377
Thr. = 0.35
AuPRC = 0.6848
Smin = 14.4202
