In [2]:
import pandas as pd
from Bio import SeqIO
from collections import Counter
from goatools.obo_parser import GODag
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import os

# Carregar GO anotações
annotations = pd.read_csv("uniprot_sprot_exp.txt", sep="\t", names=["protein_id", "go_term", "go_category"])
annotations_f = annotations[annotations["go_category"] == "F"]

# Carregar DAG e propagar GO terms
# propagação hierárquica
# https://geneontology.org/docs/download-ontology/
go_dag = GODag("go.obo")
mf_terms = {t for t, o in go_dag.items() if o.namespace == "molecular_function"}

def propagate_terms(term_list):
 full = set()
 for t in term_list:
 if t not in go_dag:
 continue
 full.add(t)
 full.update(go_dag[t].get_all_parents())
 return list(full & mf_terms)

# Carregar sequências
seqs, ids = [], []
for record in SeqIO.parse("uniprot_sprot_exp.fasta", "fasta"):
 ids.append(record.id)
 seqs.append(str(record.seq))

seq_df = pd.DataFrame({"protein_id": ids, "sequence": seqs})

# Juntar com GO anotado e propagar
grouped = annotations_f.groupby("protein_id")["go_term"].apply(list).reset_index()
data = seq_df.merge(grouped, on="protein_id")
data = data[data["go_term"].apply(len) > 0]
data["go_term"] = data["go_term"].apply(propagate_terms)
data = data[data["go_term"].apply(len) > 0]

# Filtrar GO terms raros
# todos os terms com menos de 50 proteinas associadas
all_terms = [term for sublist in data["go_term"] for term in sublist]
term_counts = Counter(all_terms)
valid_terms = {term for term, count in term_counts.items() if count >= 50}
data["go_term"] = data["go_term"].apply(lambda terms: [t for t in terms if t in valid_terms])
data = data[data["go_term"].apply(len) > 0]

# Preparar dataset final
data["go_terms"] = data["go_term"].apply(lambda x: ';'.join(sorted(set(x))))
data = data[["protein_id", "sequence", "go_terms"]].drop_duplicates()

# Binarizar labels e dividir
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(data["go_terms"].str.split(";"))
X = data[["protein_id", "sequence"]].values

mskf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)
train_idx, temp_idx = next(mskf.split(X, Y))
val_idx, test_idx = np.array_split(temp_idx, 2)

df_train = data.iloc[train_idx].copy()
df_val = data.iloc[val_idx].copy()
df_test = data.iloc[test_idx].copy()

# Guardar em CSV
os.makedirs("data", exist_ok=True)
df_train.to_csv("data/mf-training.csv", index=False)
df_val.to_csv("data/mf-validation.csv", index=False)
df_test.to_csv("data/mf-test.csv", index=False)

# Confirmar
print("✓ Ficheiros criados:")
print(" - data/mf-training.csv :", df_train.shape, df_train.columns.tolist())
print(" - data/mf-validation.csv:", df_val.shape, df_val.columns.tolist())
print(" - data/mf-test.csv :", df_test.shape, df_test.columns.tolist())
print(f"GO terms únicos (após propagação e filtro): {len(mlb.classes_)}")


go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
✓ Ficheiros criados:
 - data/mf-training.csv : (31142, 3)
 - data/mf-validation.csv: (1724, 3)
 - data/mf-test.csv : (1724, 3)
GO terms únicos (após propagação e filtro): 602


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import os
import ktrain
from ktrain import text
from sklearn.preprocessing import MultiLabelBinarizer


# PAM1
# PAM matrix model of protein evolution
# DOI:10.1093/oxfordjournals.molbev.a040360
pam_data = {
 'A': [9948, 19, 27, 42, 31, 46, 50, 92, 17, 7, 40, 88, 42, 41, 122, 279, 255, 9, 72, 723],
 'R': [14, 9871, 24, 38, 37, 130, 38, 62, 49, 4, 58, 205, 26, 33, 47, 103, 104, 5, 36, 52],
 'N': [20, 22, 9860, 181, 29, 36, 41, 67, 31, 5, 22, 49, 23, 10, 33, 83, 66, 3, 43, 32],
 'D': [40, 34, 187, 9818, 11, 63, 98, 61, 23, 5, 25, 54, 43, 13, 27, 88, 55, 4, 29, 36],
 'C': [20, 16, 26, 9, 9987, 10, 17, 37, 12, 2, 16, 26, 10, 19, 27, 26, 25, 2, 6, 67],
 'Q': [29, 118, 29, 49, 8, 9816, 72, 55, 36, 4, 60, 158, 35, 22, 39, 86, 74, 3, 34, 28],
 'E': [35, 29, 41, 101, 12, 71, 9804, 56, 33, 5, 36, 107, 42, 20, 38, 87, 69, 4, 30, 42],
 'G': [96, 61, 77, 70, 38, 51, 58, 9868, 26, 6, 37, 53, 39, 28, 69, 134, 116, 5, 47, 60],
 'H': [17, 53, 33, 19, 15, 39, 34, 24, 9907, 3, 32, 57, 24, 15, 27, 47, 43, 2, 22, 19],
 'I': [6, 3, 6, 6, 3, 5, 6, 7, 3, 9973, 23, 13, 12, 41, 93, 84, 115, 3, 8, 102],
 'L': [26, 39, 17, 15, 7, 33, 22, 20, 19, 27, 9864, 49, 24, 78, 117, 148, 193, 5, 24, 70],
 'K': [60, 198, 43, 52, 12, 142, 96, 53, 42, 10, 63, 9710, 33, 26, 54, 109, 102, 5, 43, 42],
 'M': [21, 22, 15, 18, 6, 20, 18, 18, 17, 11, 27, 32, 9945, 26, 34, 61, 71, 3, 12, 31],
 'F': [18, 17, 8, 6, 8, 11, 10, 16, 10, 44, 92, 24, 29, 9899, 89, 88, 142, 7, 14, 68],
 'P': [97, 47, 35, 29, 23, 35, 38, 57, 21, 24, 47, 56, 28, 76, 9785, 115, 77, 4, 24, 35],
 'S': [241, 87, 76, 73, 17, 56, 60, 99, 32, 13, 69, 92, 42, 67, 100, 9605, 212, 8, 63, 70],
 'T': [186, 78, 54, 37, 14, 42, 42, 83, 28, 23, 84, 85, 53, 93, 66, 182, 9676, 8, 39, 90],
 'W': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 5, 3, 4, 4, 9960, 3, 4],
 'Y': [29, 21, 17, 9, 4, 13, 9, 21, 10, 7, 20, 17, 11, 23, 19, 41, 31, 3, 9935, 23],
 'V': [368, 27, 18, 18, 50, 23, 34, 64, 15, 85, 72, 42, 33, 88, 42, 112, 137, 4, 20, 9514]
}
pam_raw = pd.DataFrame(pam_data, index=list(pam_data.keys()))
pam_matrix = pam_raw.div(pam_raw.sum(axis=1), axis=0)
list_amino = pam_raw.columns.tolist()
pam_dict = {
 aa: {sub: pam_matrix.loc[aa, sub] for sub in list_amino}
 for aa in list_amino
}

def pam1_substitution(aa):
 if aa not in pam_dict:
 return aa
 subs = list(pam_dict[aa].keys())
 probs = list(pam_dict[aa].values())
 return np.random.choice(subs, p=probs)

def augment_sequence(seq, sub_prob=0.05):
 return ''.join([pam1_substitution(aa) if random.random() < sub_prob else aa for aa in seq])

def slice_sequence(seq, win=512):
 return [seq[i:i+win] for i in range(0, len(seq), win)]

def generate_data(df, augment=False):
 X, y = [], []
 label_cols = [col for col in df.columns if col.startswith("GO:")]
 for _, row in tqdm(df.iterrows(), total=len(df)):
 seq = row["sequence"]
 if augment:
 seq = augment_sequence(seq)
 seq_slices = slice_sequence(seq)
 X.extend(seq_slices)
 lbl = row[label_cols].values.astype(int)
 y.extend([lbl] * len(seq_slices))
 return X, np.array(y), label_cols

def format_sequence(seq): return " ".join(list(seq))

# Função para carregar e binarizar
def load_and_binarize(csv_path, mlb=None):
 df = pd.read_csv(csv_path)
 df["go_terms"] = df["go_terms"].str.split(";")
 if mlb is None:
 mlb = MultiLabelBinarizer()
 labels = mlb.fit_transform(df["go_terms"])
 else:
 labels = mlb.transform(df["go_terms"])
 labels_df = pd.DataFrame(labels, columns=mlb.classes_)
 df = df.reset_index(drop=True).join(labels_df)
 return df, mlb

# Carregar os dados
df_train, mlb = load_and_binarize("data/mf-training.csv")
df_val, _ = load_and_binarize("data/mf-validation.csv", mlb=mlb)

# Gerar com augmentation no treino
X_train, y_train, term_cols = generate_data(df_train, augment=True)
X_val, y_val, _ = generate_data(df_val, augment=False)

# Preparar texto para tokenizer
X_train_fmt = list(map(format_sequence, X_train))
X_val_fmt = list(map(format_sequence, X_val))

# Fine-tune ProtBERT
# https://huggingface.co/Rostlab/prot_bert
# https://doi.org/10.1093/bioinformatics/btac020
# dados de treino-> UniRef100 (216 milhões de sequências)
MODEL_NAME = "Rostlab/prot_bert"
MAX_LEN = 512
BATCH_SIZE = 1

t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, classes=term_cols)
trn = t.preprocess_train(X_train_fmt, y_train)
val = t.preprocess_test(X_val_fmt, y_val)

model = t.get_classifier()
learner = ktrain.get_learner(model,
 train_data=trn,
 val_data=val,
 batch_size=BATCH_SIZE)

learner.autofit(lr=1e-5,
 epochs=10,
 early_stopping=1,
 checkpoint_folder="mf-fine-tuned-protbert")


 from .autonotebook import tqdm as notebook_tqdm
 _torch_pytree._register_pytree_node(
100%|██████████| 31142/31142 [00:24<00:00, 1262.18it/s]
100%|██████████| 1724/1724 [00:00<00:00, 2628.24it/s]


preprocessing train...
language: de
train sequence lengths:
	mean : 423
	95percentile : 604
	99percentile : 715


Is Multi-Label? True
preprocessing test...
language: de
test sequence lengths:
	mean : 408
	95percentile : 603
	99percentile : 714






begin training using triangular learning rate policy with max lr of 1e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


RuntimeError: Can't decrement id ref count (unable to extend file properly)

In [19]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib
import gc
from transformers import AutoTokenizer, TFAutoModel

# Parâmetros
MODEL_DIR = "weights/mf-fine-tuned-protbert-epoch10"
BASE_MODEL = "Rostlab/prot_bert"
OUT_DIR = "embeddings"
BATCH_TOK = 16

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=False)
model = TFAutoModel.from_pretrained(MODEL_DIR, from_pt=False)

print("✓ Tokenizer base e modelo fine-tuned carregados com sucesso")

# Funções auxiliares

def get_embeddings(batch, tokenizer, model):
 tokens = tokenizer(batch, return_tensors="tf", padding=True, truncation=True, max_length=512)
 output = model(**tokens)
 return output.last_hidden_state[:, 0, :].numpy()

def process_split(csv_path, out_path):
 df = pd.read_csv(csv_path)
 label_cols = [col for col in df.columns if col.startswith("GO:")]
 prot_ids, embeds, labels = [], [], []

 for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processando {csv_path}"):
 slices = slice_sequence(row["sequence"])
 slices_fmt = list(map(format_sequence, slices))

 slice_embeds = []
 for i in range(0, len(slices_fmt), BATCH_TOK):
 batch = slices_fmt[i:i+BATCH_TOK]
 slice_embeds.append(get_embeddings(batch, tokenizer, model))
 slice_embeds = np.vstack(slice_embeds)

 prot_embed = slice_embeds.mean(axis=0)
 prot_ids.append(row["protein_id"])
 embeds.append(prot_embed.astype(np.float32))
 labels.append(row[label_cols].values.astype(np.int8))
 gc.collect()

 embeds = np.vstack(embeds)
 labels = np.vstack(labels)

 joblib.dump({
 "protein_ids": prot_ids,
 "embeddings": embeds,
 "labels": labels,
 "go_terms": label_cols
 }, out_path, compress=3)

 print(f"✓ Guardado {out_path} — {embeds.shape[0]} proteínas")

# Aplicar
os.makedirs(OUT_DIR, exist_ok=True)

process_split("data/mf-training.csv", os.path.join(OUT_DIR, "train_protbert.pkl"))
process_split("data/mf-validation.csv", os.path.join(OUT_DIR, "val_protbert.pkl"))
process_split("data/mf-test.csv", os.path.join(OUT_DIR, "test_protbert.pkl"))


 from .autonotebook import tqdm as notebook_tqdm
 _torch_pytree._register_pytree_node(
 _torch_pytree._register_pytree_node(
Some layers from the model checkpoint at weights/mf-fine-tuned-protbert-epoch10 were not used when initializing TFBertModel: ['classifier', 'dropout_183']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at weights/mf-fine-tuned-protbert-epoch10.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further tra

✓ Tokenizer base e modelo fine-tuned carregados com sucesso


Processando data/mf-training.csv: 0%| | 25/31142 [00:06<2:23:28, 3.61it/s]


KeyboardInterrupt: 

In [27]:
import pandas as pd
import joblib
from sklearn.preprocessing import MultiLabelBinarizer

# Obter GO terms do ficheiro de teste
df_test = pd.read_csv("data/mf-test.csv")
test_terms = sorted(set(term for row in df_test["go_terms"].str.split(";") for term in row))

# Função para corrigir um .pkl com base nos GO terms do teste
def patch_to_common_terms(csv_path, pkl_path, common_terms):
 df = pd.read_csv(csv_path)
 terms_split = df["go_terms"].str.split(";")
 
 # Apenas termos presentes nos common_terms
 terms_filtered = terms_split.apply(lambda lst: [t for t in lst if t in common_terms])
 
 mlb = MultiLabelBinarizer(classes=common_terms)
 Y = mlb.fit_transform(terms_filtered)

 data = joblib.load(pkl_path)
 data["labels"] = Y
 data["go_terms"] = mlb.classes_.tolist()
 
 joblib.dump(data, pkl_path, compress=3)
 print(f"✓ Corrigido: {pkl_path} — {Y.shape[0]} exemplos, {Y.shape[1]} GO terms")

# Aplicar às 3 partições
patch_to_common_terms("data/mf-training.csv", "embeddings/train_protbert.pkl", test_terms)
patch_to_common_terms("data/mf-validation.csv", "embeddings/val_protbert.pkl", test_terms)
patch_to_common_terms("data/mf-test.csv", "embeddings/test_protbert.pkl", test_terms)



✓ Corrigido: embeddings/train_protbert.pkl — 31142 exemplos, 597 GO terms
✓ Corrigido: embeddings/val_protbert.pkl — 1724 exemplos, 597 GO terms
✓ Corrigido: embeddings/test_protbert.pkl — 1724 exemplos, 597 GO terms


In [2]:
import tensorflow as tf
import joblib
import numpy as np
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Carregar embeddings
train = joblib.load("embeddings/train_protbert.pkl")
val = joblib.load("embeddings/val_protbert.pkl")
test = joblib.load("embeddings/test_protbert.pkl")

X_train, y_train = train["embeddings"], train["labels"]
X_val, y_val = val["embeddings"], val["labels"]
X_test, y_test = test["embeddings"], test["labels"]

print(f"✓ Embeddings carregados: {X_train.shape} → {y_train.shape[1]} GO terms")

# Garantir consistência de classes
max_classes = y_train.shape[1] # 602 GO terms (do treino)

def pad_labels(y, target_dim=max_classes):
 if y.shape[1] < target_dim:
 padding = np.zeros((y.shape[0], target_dim - y.shape[1]), dtype=np.int8)
 return np.hstack([y, padding])
 return y

y_val = pad_labels(y_val)
y_test = pad_labels(y_test)

# Modelo MLP
model = Sequential([
 Dense(1024, activation="relu", input_shape=(X_train.shape[1],)),
 Dropout(0.3),
 Dense(512, activation="relu"),
 Dropout(0.3),
 Dense(max_classes, activation="sigmoid")
])

model.compile(loss="binary_crossentropy",
 optimizer="adam",
 metrics=["binary_accuracy"])

# Early stopping e treino
callbacks = [
 EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
]

model.fit(X_train, y_train,
 validation_data=(X_val, y_val),
 epochs=100,
 batch_size=32,
 callbacks=callbacks,
 verbose=1)

# Previsões
y_prob = model.predict(X_test)
np.save("predictions/mf-protbert-pam1.npy", y_prob)
print("Previsões guardadas em mf-protbert-pam1.npy")

# Modelo
model.save("models/mlp_protbert.h5")
model.save("models/mlp_protbert.keras")
print("Modelos guardado em models/")

✓ Embeddings carregados: (31142, 1024) → 597 GO terms
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Previsões guardadas em mf-protbert-pam1.npy
Modelos guardado em models/


In [3]:
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
from goatools.obo_parser import GODag
import joblib
import math

# Parâmetros
GO_FILE = "go.obo"
THRESHOLDS = np.arange(0.0, 1.01, 0.01)
ALPHA = 0.5

# Carregar dados
test = joblib.load("embeddings/test_protbert.pkl")
y_true = test["labels"]
terms = test["go_terms"]
y_prob = np.load("predictions/mf-protbert-pam1.npy")
go_dag = GODag(GO_FILE)

print(f"✓ Embeddings: {y_true.shape} labels × {len(terms)} GO terms")

# Fmax
def compute_fmax(y_true, y_prob, thresholds):
 fmax, best_thr = 0, 0
 for t in thresholds:
 y_pred = (y_prob >= t).astype(int)
 tp = (y_true * y_pred).sum(axis=1)
 fp = ((1 - y_true) * y_pred).sum(axis=1)
 fn = (y_true * (1 - y_pred)).sum(axis=1)
 precision = tp / (tp + fp + 1e-8)
 recall = tp / (tp + fn + 1e-8)
 f1 = 2 * precision * recall / (precision + recall + 1e-8)
 avg_f1 = np.mean(f1)
 if avg_f1 > fmax:
 fmax, best_thr = avg_f1, t
 return fmax, best_thr

# AuPRC micro
def compute_auprc(y_true, y_prob):
 precision, recall, _ = precision_recall_curve(y_true.ravel(), y_prob.ravel())
 return auc(recall, precision)

# Smin
def compute_smin(y_true, y_prob, terms, threshold, go_dag, alpha=ALPHA):
 y_pred = (y_prob >= threshold).astype(int)
 ic = {}
 total = (y_true + y_pred).sum(axis=0).sum()
 for i, term in enumerate(terms):
 freq = (y_true[:, i] + y_pred[:, i]).sum()
 ic[term] = -np.log((freq + 1e-8) / total)

 s_values = []
 for true_vec, pred_vec in zip(y_true, y_pred):
 true_terms = {terms[i] for i in np.where(true_vec)[0]}
 pred_terms = {terms[i] for i in np.where(pred_vec)[0]}

 anc_true = set()
 for t in true_terms:
 if t in go_dag:
 anc_true |= go_dag[t].get_all_parents()
 anc_pred = set()
 for t in pred_terms:
 if t in go_dag:
 anc_pred |= go_dag[t].get_all_parents()

 ru = pred_terms - true_terms
 mi = true_terms - pred_terms
 dist_ru = sum(ic.get(t, 0) for t in ru)
 dist_mi = sum(ic.get(t, 0) for t in mi)
 s = math.sqrt((alpha * dist_ru)**2 + ((1 - alpha) * dist_mi)**2)
 s_values.append(s)

 return np.mean(s_values)

# --- 6. Avaliar ----------------------------------------------------------
fmax, thr = compute_fmax(y_true, y_prob, THRESHOLDS)
auprc = compute_auprc(y_true, y_prob)
smin = compute_smin(y_true, y_prob, terms, thr, go_dag)

print(f"\n📊 Resultados finais (ProtBERT + PAM1 + propagação):")
print(f"Fmax = {fmax:.4f}")
print(f"Thr. = {thr:.2f}")
print(f"AuPRC = {auprc:.4f}")
print(f"Smin = {smin:.4f}")


go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
✓ Embeddings: (1724, 597) labels × 597 GO terms

📊 Resultados finais (ProtBERT + PAM1 + propagação):
Fmax = 0.6611
Thr. = 0.45
AuPRC = 0.6951
Smin = 13.4386
