{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "c6dbc330-062a-48f0-8242-3f21cc1c9c2b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n", "✓ Ficheiros criados:\n", " - data/mf-training.csv : (31142, 3)\n", " - data/mf-validation.csv: (1724, 3)\n", " - data/mf-test.csv : (1724, 3)\n", "GO terms únicos (após propagação e filtro): 602\n" ] } ], "source": [ "import pandas as pd\n", "from Bio import SeqIO\n", "from collections import Counter\n", "from goatools.obo_parser import GODag\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n", "import numpy as np\n", "import os\n", "\n", "# Carregar GO anotações\n", "annotations = pd.read_csv(\"uniprot_sprot_exp.txt\", sep=\"\\t\", names=[\"protein_id\", \"go_term\", \"go_category\"])\n", "annotations_f = annotations[annotations[\"go_category\"] == \"F\"]\n", "\n", "# Carregar DAG e propagar GO terms\n", "# propagação hierárquica\n", "# https://geneontology.org/docs/download-ontology/\n", "go_dag = GODag(\"go.obo\")\n", "mf_terms = {t for t, o in go_dag.items() if o.namespace == \"molecular_function\"}\n", "\n", "def propagate_terms(term_list):\n", " full = set()\n", " for t in term_list:\n", " if t not in go_dag:\n", " continue\n", " full.add(t)\n", " full.update(go_dag[t].get_all_parents())\n", " return list(full & mf_terms)\n", "\n", "# Carregar sequências\n", "seqs, ids = [], []\n", "for record in SeqIO.parse(\"uniprot_sprot_exp.fasta\", \"fasta\"):\n", " ids.append(record.id)\n", " seqs.append(str(record.seq))\n", "\n", "seq_df = pd.DataFrame({\"protein_id\": ids, \"sequence\": seqs})\n", "\n", "# Juntar com GO anotado e propagar\n", "grouped = annotations_f.groupby(\"protein_id\")[\"go_term\"].apply(list).reset_index()\n", "data = seq_df.merge(grouped, on=\"protein_id\")\n", "data = data[data[\"go_term\"].apply(len) > 0]\n", "data[\"go_term\"] = data[\"go_term\"].apply(propagate_terms)\n", "data = data[data[\"go_term\"].apply(len) > 0]\n", "\n", "# Filtrar GO terms raros\n", "# todos os terms com menos de 50 proteinas associadas\n", "all_terms = [term for sublist in data[\"go_term\"] for term in sublist]\n", "term_counts = Counter(all_terms)\n", "valid_terms = {term for term, count in term_counts.items() if count >= 50}\n", "data[\"go_term\"] = data[\"go_term\"].apply(lambda terms: [t for t in terms if t in valid_terms])\n", "data = data[data[\"go_term\"].apply(len) > 0]\n", "\n", "# Preparar dataset final\n", "data[\"go_terms\"] = data[\"go_term\"].apply(lambda x: ';'.join(sorted(set(x))))\n", "data = data[[\"protein_id\", \"sequence\", \"go_terms\"]].drop_duplicates()\n", "\n", "# Binarizar labels e dividir\n", "mlb = MultiLabelBinarizer()\n", "Y = mlb.fit_transform(data[\"go_terms\"].str.split(\";\"))\n", "X = data[[\"protein_id\", \"sequence\"]].values\n", "\n", "mskf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)\n", "train_idx, temp_idx = next(mskf.split(X, Y))\n", "val_idx, test_idx = np.array_split(temp_idx, 2)\n", "\n", "df_train = data.iloc[train_idx].copy()\n", "df_val = data.iloc[val_idx].copy()\n", "df_test = data.iloc[test_idx].copy()\n", "\n", "# Guardar em CSV\n", "os.makedirs(\"data\", exist_ok=True)\n", "df_train.to_csv(\"data/mf-training.csv\", index=False)\n", "df_val.to_csv(\"data/mf-validation.csv\", index=False)\n", "df_test.to_csv(\"data/mf-test.csv\", 
{ "cell_type": "code", "execution_count": 2, "id": "6cf7aaa6-4941-4951-8d73-1f4f1f4362f3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n", " _torch_pytree._register_pytree_node(\n", "100%|██████████| 31142/31142 [00:24<00:00, 1262.18it/s]\n", "100%|██████████| 1724/1724 [00:00<00:00, 2628.24it/s]\n", "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\text\\preprocessor.py:382: UserWarning: The class_names argument is replacing the classes argument. Please update your code.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "preprocessing train...\n", "language: de\n", "train sequence lengths:\n", "\tmean : 423\n", "\t95percentile : 604\n", "\t99percentile : 715\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Is Multi-Label? True\n", "preprocessing test...\n", "language: de\n", "test sequence lengths:\n", "\tmean : 408\n", "\t95percentile : 603\n", "\t99percentile : 714\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\text\\preprocessor.py:1093: UserWarning: Could not load a Tensorflow version of model. (If this worked before, it might be an out-of-memory issue.) Attempting to download/load PyTorch version as TensorFlow model using from_pt=True. 
You will need PyTorch installed for this.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "begin training using triangular learning rate policy with max lr of 1e-05...\n", "Epoch 1/10\n", "40995/40995 [==============================] - 13053s 318ms/step - loss: 0.0745 - binary_accuracy: 0.9866 - val_loss: 0.0582 - val_binary_accuracy: 0.9859\n", "Epoch 2/10\n", "40995/40995 [==============================] - 14484s 353ms/step - loss: 0.0504 - binary_accuracy: 0.9873 - val_loss: 0.0499 - val_binary_accuracy: 0.9867\n", "Epoch 3/10\n", "40995/40995 [==============================] - 14472s 353ms/step - loss: 0.0450 - binary_accuracy: 0.9879 - val_loss: 0.0449 - val_binary_accuracy: 0.9873\n", "Epoch 4/10\n", "40995/40995 [==============================] - 14445s 352ms/step - loss: 0.0407 - binary_accuracy: 0.9884 - val_loss: 0.0413 - val_binary_accuracy: 0.9878\n", "Epoch 5/10\n", "40995/40995 [==============================] - 12524s 305ms/step - loss: 0.0378 - binary_accuracy: 0.9888 - val_loss: 0.0394 - val_binary_accuracy: 0.9881\n", "Epoch 6/10\n", "40995/40995 [==============================] - 14737s 359ms/step - loss: 0.0359 - binary_accuracy: 0.9891 - val_loss: 0.0383 - val_binary_accuracy: 0.9883\n", "Epoch 7/10\n", "40995/40995 [==============================] - 20317s 495ms/step - loss: 0.0343 - binary_accuracy: 0.9894 - val_loss: 0.0371 - val_binary_accuracy: 0.9885\n", "Epoch 8/10\n", "40995/40995 [==============================] - 9073s 221ms/step - loss: 0.0331 - binary_accuracy: 0.9896 - val_loss: 0.0364 - val_binary_accuracy: 0.9887\n", "Epoch 9/10\n", "40995/40995 [==============================] - 9001s 219ms/step - loss: 0.0320 - binary_accuracy: 0.9898 - val_loss: 0.0360 - val_binary_accuracy: 0.9888\n", "Epoch 10/10\n", "40995/40995 [==============================] - 8980s 219ms/step - loss: 0.0311 - binary_accuracy: 0.9900 - val_loss: 0.0356 - val_binary_accuracy: 0.9890\n" ] }, { "ename": "RuntimeError", "evalue": "Can't decrement id ref count (unable to extend file properly)", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\engine\\training.py:2252\u001b[0m, in \u001b[0;36mModel.save_weights\u001b[1;34m(self, filepath, overwrite, save_format, options)\u001b[0m\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m h5py\u001b[38;5;241m.\u001b[39mFile(filepath, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m-> 2252\u001b[0m \u001b[43mhdf5_format\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_weights_to_hdf5_group\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayers\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\saving\\hdf5_format.py:646\u001b[0m, in \u001b[0;36msave_weights_to_hdf5_group\u001b[1;34m(f, layers)\u001b[0m\n\u001b[0;32m 645\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 646\u001b[0m param_dset[:] \u001b[38;5;241m=\u001b[39m val\n", "File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in 
\u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\h5py\\_hl\\dataset.py:999\u001b[0m, in \u001b[0;36mDataset.__setitem__\u001b[1;34m(self, args, val)\u001b[0m\n\u001b[0;32m 998\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m fspace \u001b[38;5;129;01min\u001b[39;00m selection\u001b[38;5;241m.\u001b[39mbroadcast(mshape):\n\u001b[1;32m--> 999\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmspace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfspace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdxpl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dxpl\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\h5d.pyx:282\u001b[0m, in \u001b[0;36mh5py.h5d.DatasetID.write\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\_proxy.pyx:115\u001b[0m, in \u001b[0;36mh5py._proxy.dset_rw\u001b[1;34m()\u001b[0m\n", "\u001b[1;31mOSError\u001b[0m: [Errno 28] Can't write data (file write failed: time = Wed May 7 10:48:36 2025\n, filename = 'mf-fine-tuned-protbert\\weights-10-0.04.hdf5', file descriptor = 4, errno = 28, error message = 'No space left on device', buf = 000002CC552FF040, total write size = 4194304, bytes this sub-write = 4194304, bytes actually written = 18446744073709551615, offset = 1180551864)", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[2], line 119\u001b[0m\n\u001b[0;32m 113\u001b[0m model \u001b[38;5;241m=\u001b[39m t\u001b[38;5;241m.\u001b[39mget_classifier()\n\u001b[0;32m 114\u001b[0m learner \u001b[38;5;241m=\u001b[39m ktrain\u001b[38;5;241m.\u001b[39mget_learner(model,\n\u001b[0;32m 115\u001b[0m train_data\u001b[38;5;241m=\u001b[39mtrn,\n\u001b[0;32m 116\u001b[0m val_data\u001b[38;5;241m=\u001b[39mval,\n\u001b[0;32m 117\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mBATCH_SIZE)\n\u001b[1;32m--> 119\u001b[0m \u001b[43mlearner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautofit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 121\u001b[0m \u001b[43m \u001b[49m\u001b[43mearly_stopping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 122\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_folder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmf-fine-tuned-protbert\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File 
\u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\core.py:1239\u001b[0m, in \u001b[0;36mLearner.autofit\u001b[1;34m(self, lr, epochs, early_stopping, reduce_on_plateau, reduce_factor, cycle_momentum, max_momentum, min_momentum, monitor, checkpoint_folder, class_weight, callbacks, steps_per_epoch, verbose)\u001b[0m\n\u001b[0;32m 1234\u001b[0m policy \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtriangular learning rate\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1235\u001b[0m U\u001b[38;5;241m.\u001b[39mvprint(\n\u001b[0;32m 1236\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbegin training using \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m policy with max lr of \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m...\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (policy, lr),\n\u001b[0;32m 1237\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[0;32m 1238\u001b[0m )\n\u001b[1;32m-> 1239\u001b[0m hist \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1240\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1241\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1242\u001b[0m \u001b[43m \u001b[49m\u001b[43mearly_stopping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mearly_stopping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1243\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_folder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1244\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1245\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1246\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1247\u001b[0m \u001b[43m \u001b[49m\u001b[43msteps_per_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msteps_per_epoch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1248\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1249\u001b[0m hist\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m clr\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1250\u001b[0m hist\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miterations\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m clr\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miterations\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\core.py:1650\u001b[0m, in \u001b[0;36mGenLearner.fit\u001b[1;34m(self, lr, n_cycles, cycle_len, cycle_mult, lr_decay, checkpoint_folder, early_stopping, class_weight, callbacks, steps_per_epoch, verbose)\u001b[0m\n\u001b[0;32m 1648\u001b[0m warnings\u001b[38;5;241m.\u001b[39mfilterwarnings(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.*Check your callbacks.*\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 1649\u001b[0m fit_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mfit\n\u001b[1;32m-> 1650\u001b[0m hist \u001b[38;5;241m=\u001b[39m \u001b[43mfit_fn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1651\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_data\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1652\u001b[0m \u001b[43m \u001b[49m\u001b[43msteps_per_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msteps_per_epoch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1653\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1654\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1655\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mval_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1656\u001b[0m \u001b[43m \u001b[49m\u001b[43mworkers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworkers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1657\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_multiprocessing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muse_multiprocessing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1658\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1659\u001b[0m \u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 1660\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1661\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1663\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sgdr \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1664\u001b[0m hist\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m sgdr\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\engine\\training.py:1230\u001b[0m, in 
\u001b[0;36mModel.fit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[0;32m 1227\u001b[0m val_logs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m+\u001b[39m name: val \u001b[38;5;28;01mfor\u001b[39;00m name, val \u001b[38;5;129;01min\u001b[39;00m val_logs\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[0;32m 1228\u001b[0m epoch_logs\u001b[38;5;241m.\u001b[39mupdate(val_logs)\n\u001b[1;32m-> 1230\u001b[0m \u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mon_epoch_end\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepoch_logs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1231\u001b[0m training_logs \u001b[38;5;241m=\u001b[39m epoch_logs\n\u001b[0;32m 1232\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstop_training:\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\callbacks.py:413\u001b[0m, in \u001b[0;36mCallbackList.on_epoch_end\u001b[1;34m(self, epoch, logs)\u001b[0m\n\u001b[0;32m 411\u001b[0m logs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_logs(logs)\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m callback \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallbacks:\n\u001b[1;32m--> 413\u001b[0m \u001b[43mcallback\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mon_epoch_end\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\callbacks.py:1368\u001b[0m, in \u001b[0;36mModelCheckpoint.on_epoch_end\u001b[1;34m(self, epoch, logs)\u001b[0m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;66;03m# pylint: disable=protected-access\u001b[39;00m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_freq \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mepoch\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m-> 1368\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\callbacks.py:1431\u001b[0m, in \u001b[0;36mModelCheckpoint._save_model\u001b[1;34m(self, epoch, batch, logs)\u001b[0m\n\u001b[0;32m 1429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mEpoch \u001b[39m\u001b[38;5;132;01m%05d\u001b[39;00m\u001b[38;5;124m: saving model to \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m%\u001b[39m (epoch 
\u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m, filepath))\n\u001b[0;32m 1430\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_weights_only:\n\u001b[1;32m-> 1431\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_weights\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1432\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilepath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moverwrite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1433\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1434\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39msave(filepath, overwrite\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_options)\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\engine\\training.py:2252\u001b[0m, in \u001b[0;36mModel.save_weights\u001b[1;34m(self, filepath, overwrite, save_format, options)\u001b[0m\n\u001b[0;32m 2250\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m save_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mh5\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m h5py\u001b[38;5;241m.\u001b[39mFile(filepath, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m-> 2252\u001b[0m hdf5_format\u001b[38;5;241m.\u001b[39msave_weights_to_hdf5_group(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayers)\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tf\u001b[38;5;241m.\u001b[39mexecuting_eagerly():\n", "File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\h5py\\_hl\\files.py:599\u001b[0m, in \u001b[0;36mFile.__exit__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 596\u001b[0m \u001b[38;5;129m@with_phil\u001b[39m\n\u001b[0;32m 597\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m 598\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid:\n\u001b[1;32m--> 599\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclose\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\h5py\\_hl\\files.py:581\u001b[0m, in \u001b[0;36mFile.close\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 575\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[0;32m 576\u001b[0m \u001b[38;5;66;03m# We have to explicitly murder all 
open objects related to the file\u001b[39;00m\n\u001b[0;32m 577\u001b[0m \n\u001b[0;32m 578\u001b[0m \u001b[38;5;66;03m# Close file-resident objects first, then the files.\u001b[39;00m\n\u001b[0;32m 579\u001b[0m \u001b[38;5;66;03m# Otherwise we get errors in MPI mode.\u001b[39;00m\n\u001b[0;32m 580\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39m_close_open_objects(h5f\u001b[38;5;241m.\u001b[39mOBJ_LOCAL \u001b[38;5;241m|\u001b[39m \u001b[38;5;241m~\u001b[39mh5f\u001b[38;5;241m.\u001b[39mOBJ_FILE)\n\u001b[1;32m--> 581\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_close_open_objects\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh5f\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mOBJ_LOCAL\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m|\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mh5f\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mOBJ_FILE\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 583\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m 584\u001b[0m _objects\u001b[38;5;241m.\u001b[39mnonlocal_close()\n", "File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mh5py\\\\h5f.pyx:355\u001b[0m, in \u001b[0;36mh5py.h5f.FileID._close_open_objects\u001b[1;34m()\u001b[0m\n", "\u001b[1;31mRuntimeError\u001b[0m: Can't decrement id ref count (unable to extend file properly)" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from tqdm import tqdm\n", "import random\n", "import os\n", "import ktrain\n", "from ktrain import text\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "\n", "\n", "# PAM1\n", "# PAM matrix model of protein evolution\n", "# DOI:10.1093/oxfordjournals.molbev.a040360\n", "pam_data = {\n", " 'A': [9948, 19, 27, 42, 31, 46, 50, 92, 17, 7, 40, 88, 42, 41, 122, 279, 255, 9, 72, 723],\n", " 'R': [14, 9871, 24, 38, 37, 130, 38, 62, 49, 4, 58, 205, 26, 33, 47, 103, 104, 5, 36, 52],\n", " 'N': [20, 22, 9860, 181, 29, 36, 41, 67, 31, 5, 22, 49, 23, 10, 33, 83, 66, 3, 43, 32],\n", " 'D': [40, 34, 187, 9818, 11, 63, 98, 61, 23, 5, 25, 54, 43, 13, 27, 88, 55, 4, 29, 36],\n", " 'C': [20, 16, 26, 9, 9987, 10, 17, 37, 12, 2, 16, 26, 10, 19, 27, 26, 25, 2, 6, 67],\n", " 'Q': [29, 118, 29, 49, 8, 9816, 72, 55, 36, 4, 60, 158, 35, 22, 39, 86, 74, 3, 34, 28],\n", " 'E': [35, 29, 41, 101, 12, 71, 9804, 56, 33, 5, 36, 107, 42, 20, 38, 87, 69, 4, 30, 42],\n", " 'G': [96, 61, 77, 70, 38, 51, 58, 9868, 26, 6, 37, 53, 39, 28, 69, 134, 116, 5, 47, 60],\n", " 'H': [17, 53, 33, 19, 15, 39, 34, 24, 9907, 3, 32, 57, 24, 15, 27, 47, 43, 2, 22, 19],\n", " 'I': [6, 3, 6, 6, 3, 5, 6, 7, 3, 9973, 23, 13, 12, 41, 93, 84, 115, 3, 8, 102],\n", " 'L': [26, 39, 17, 15, 7, 33, 22, 20, 19, 27, 9864, 49, 24, 78, 117, 148, 193, 5, 24, 70],\n", " 'K': [60, 198, 43, 52, 12, 142, 96, 53, 42, 10, 63, 9710, 33, 26, 54, 109, 102, 5, 43, 42],\n", " 'M': [21, 22, 15, 18, 6, 20, 18, 18, 17, 11, 27, 32, 9945, 26, 34, 61, 71, 3, 12, 31],\n", " 'F': [18, 17, 8, 6, 8, 11, 10, 16, 10, 44, 92, 24, 29, 9899, 89, 88, 142, 7, 14, 68],\n", " 'P': [97, 47, 35, 29, 23, 35, 38, 57, 21, 24, 47, 56, 28, 76, 9785, 115, 77, 4, 24, 35],\n", " 'S': [241, 87, 76, 73, 17, 56, 60, 99, 32, 13, 69, 92, 42, 67, 
100, 9605, 212, 8, 63, 70],\n", " 'T': [186, 78, 54, 37, 14, 42, 42, 83, 28, 23, 84, 85, 53, 93, 66, 182, 9676, 8, 39, 90],\n", " 'W': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 5, 3, 4, 4, 9960, 3, 4],\n", " 'Y': [29, 21, 17, 9, 4, 13, 9, 21, 10, 7, 20, 17, 11, 23, 19, 41, 31, 3, 9935, 23],\n", " 'V': [368, 27, 18, 18, 50, 23, 34, 64, 15, 85, 72, 42, 33, 88, 42, 112, 137, 4, 20, 9514]\n", "}\n", "pam_raw = pd.DataFrame(pam_data, index=list(pam_data.keys()))\n", "# Row-normalize so each row is a substitution probability distribution\n", "pam_matrix = pam_raw.div(pam_raw.sum(axis=1), axis=0)\n", "list_amino = pam_raw.columns.tolist()\n", "pam_dict = {\n", " aa: {sub: pam_matrix.loc[aa, sub] for sub in list_amino}\n", " for aa in list_amino\n", "}\n", "\n", "# Draw a replacement residue from the PAM1 row of aa (usually aa itself)\n", "def pam1_substitution(aa):\n", " if aa not in pam_dict:\n", " return aa\n", " subs = list(pam_dict[aa].keys())\n", " probs = list(pam_dict[aa].values())\n", " return np.random.choice(subs, p=probs)\n", "\n", "# Each residue is passed through the PAM1 model with probability sub_prob\n", "def augment_sequence(seq, sub_prob=0.05):\n", " return ''.join([pam1_substitution(aa) if random.random() < sub_prob else aa for aa in seq])\n", "\n", "# Split a sequence into non-overlapping 512-residue windows\n", "# NOTE: a full 512-residue window tokenizes to 514 tokens once [CLS]/[SEP] are\n", "# added, so the tokenizer's max_length=512 drops the last two residues\n", "def slice_sequence(seq, win=512):\n", " return [seq[i:i+win] for i in range(0, len(seq), win)]\n", "\n", "def generate_data(df, augment=False):\n", " X, y = [], []\n", " label_cols = [col for col in df.columns if col.startswith(\"GO:\")]\n", " for _, row in tqdm(df.iterrows(), total=len(df)):\n", " seq = row[\"sequence\"]\n", " if augment:\n", " seq = augment_sequence(seq)\n", " seq_slices = slice_sequence(seq)\n", " X.extend(seq_slices)\n", " lbl = row[label_cols].values.astype(int)\n", " y.extend([lbl] * len(seq_slices))\n", " return X, np.array(y), label_cols\n", "\n", "# ProtBERT expects space-separated residues\n", "def format_sequence(seq): return \" \".join(list(seq))\n", "\n", "# Helper: load a CSV split and binarize its GO-term labels\n", "def load_and_binarize(csv_path, mlb=None):\n", " df = pd.read_csv(csv_path)\n", " df[\"go_terms\"] = df[\"go_terms\"].str.split(\";\")\n", " if mlb is None:\n", " mlb = MultiLabelBinarizer()\n", " labels = mlb.fit_transform(df[\"go_terms\"])\n", " else:\n", " labels = mlb.transform(df[\"go_terms\"])\n", " labels_df = pd.DataFrame(labels, columns=mlb.classes_)\n", " df = df.reset_index(drop=True).join(labels_df)\n", " return df, mlb\n", "\n", "# Load the data\n", "df_train, mlb = load_and_binarize(\"data/mf-training.csv\")\n", "df_val, _ = load_and_binarize(\"data/mf-validation.csv\", mlb=mlb)\n", "\n", "# Generate training data with augmentation; validation without\n", "X_train, y_train, term_cols = generate_data(df_train, augment=True)\n", "X_val, y_val, _ = generate_data(df_val, augment=False)\n", "\n", "# Format sequences for the tokenizer\n", "X_train_fmt = list(map(format_sequence, X_train))\n", "X_val_fmt = list(map(format_sequence, X_val))\n", "\n", "# Fine-tune ProtBERT\n", "# https://huggingface.co/Rostlab/prot_bert\n", "# https://doi.org/10.1093/bioinformatics/btac020\n", "# pre-trained on UniRef100 (216 million sequences)\n", "MODEL_NAME = \"Rostlab/prot_bert\"\n", "MAX_LEN = 512\n", "BATCH_SIZE = 1\n", "\n", "t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, classes=term_cols)\n", "trn = t.preprocess_train(X_train_fmt, y_train)\n", "val = t.preprocess_test(X_val_fmt, y_val)\n", "\n", "model = t.get_classifier()\n", "learner = ktrain.get_learner(model,\n", " train_data=trn,\n", " val_data=val,\n", " batch_size=BATCH_SIZE)\n", "\n", "learner.autofit(lr=1e-5,\n", " epochs=10,\n", " early_stopping=1,\n", " checkpoint_folder=\"mf-fine-tuned-protbert\")\n" ] },
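{ "cell_type": "markdown", "id": "2b3c4d5e-6f70-4a81-92b3-c4d5e6f70a81", "metadata": {}, "source": [ "How aggressive is the PAM1 augmentation above? A residue is only mutated when both the `sub_prob` coin flip fires *and* the PAM1 draw returns a different amino acid, so the effective rate is well below `sub_prob`. A minimal check (illustrative; assumes `pam_matrix`, `pam_dict` and `list_amino` from the cell above):\n", "\n", "```python\n", "import numpy as np\n", "\n", "# Each row of pam_matrix is a probability distribution over substitutions\n", "assert np.allclose(pam_matrix.sum(axis=1), 1.0)\n", "\n", "# Mean probability that pam1_substitution returns the residue unchanged\n", "p_self = np.mean([pam_dict[aa][aa] for aa in list_amino])\n", "print(f\"mean self-substitution probability: {p_self:.3f}\")\n", "\n", "# Expected mutations per residue = sub_prob * (1 - p_self)\n", "print(f\"expected mutations per 1000 residues (sub_prob=0.05): {1000 * 0.05 * (1 - p_self):.1f}\")\n", "```" ] },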
"output_type": "stream", "text": [ "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n", " _torch_pytree._register_pytree_node(\n", "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\huggingface_hub\\file_download.py:797: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n", " _torch_pytree._register_pytree_node(\n", "Some layers from the model checkpoint at weights/mf-fine-tuned-protbert-epoch10 were not used when initializing TFBertModel: ['classifier', 'dropout_183']\n", "- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "All the layers of TFBertModel were initialized from the model checkpoint at weights/mf-fine-tuned-protbert-epoch10.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "✓ Tokenizer base e modelo fine-tuned carregados com sucesso\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Processando data/mf-training.csv: 0%| | 25/31142 [00:06<2:23:28, 3.61it/s]\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[19], line 78\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[38;5;66;03m# --- 4. 
"source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "from tqdm import tqdm\n", "import joblib\n", "import gc\n", "from transformers import AutoTokenizer, TFAutoModel\n", "\n", "# Parameters\n", "MODEL_DIR = \"weights/mf-fine-tuned-protbert-epoch10\"\n", "BASE_MODEL = \"Rostlab/prot_bert\"\n", "OUT_DIR = \"embeddings\"\n", "BATCH_TOK = 16\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=False)\n", "model = TFAutoModel.from_pretrained(MODEL_DIR, from_pt=False)\n", "\n", "print(\"✓ Base tokenizer and fine-tuned model loaded successfully\")\n", "\n", "# Helper functions (slice_sequence and format_sequence are reused from the\n", "# fine-tuning cell above; this cell must run in the same session)\n", "\n", "def get_embeddings(batch, tokenizer, model):\n", " tokens = tokenizer(batch, return_tensors=\"tf\", padding=True, truncation=True, max_length=512)\n", " output = model(**tokens)\n", " # [CLS]-token embedding of each slice\n", " return output.last_hidden_state[:, 0, :].numpy()\n", "\n", "def process_split(csv_path, out_path):\n", " df = pd.read_csv(csv_path)\n", " # NOTE: the raw CSVs carry no binarized \"GO:\" columns, so label_cols is\n", " # empty here; the label matrices are rebuilt by the patching cell below\n", " label_cols = [col for col in df.columns if col.startswith(\"GO:\")]\n", " prot_ids, embeds, labels = [], [], []\n", "\n", " for _, row in tqdm(df.iterrows(), total=len(df), desc=f\"Processing {csv_path}\"):\n", " slices = slice_sequence(row[\"sequence\"])\n", " slices_fmt = list(map(format_sequence, slices))\n", "\n", " slice_embeds = []\n", " for i in range(0, len(slices_fmt), BATCH_TOK):\n", " batch = slices_fmt[i:i+BATCH_TOK]\n", " slice_embeds.append(get_embeddings(batch, tokenizer, model))\n", " slice_embeds = np.vstack(slice_embeds)\n", "\n", " # Protein embedding = mean of its slice embeddings\n", " prot_embed = slice_embeds.mean(axis=0)\n", " prot_ids.append(row[\"protein_id\"])\n", " embeds.append(prot_embed.astype(np.float32))\n", " labels.append(row[label_cols].values.astype(np.int8))\n", " gc.collect() # free memory between proteins; running it every row is slow\n", "\n", " embeds = np.vstack(embeds)\n", " labels = np.vstack(labels)\n", "\n", " joblib.dump({\n", " \"protein_ids\": prot_ids,\n", " \"embeddings\": embeds,\n", " \"labels\": labels,\n", " \"go_terms\": label_cols\n", " }, out_path, compress=3)\n", "\n", " print(f\"✓ Saved {out_path} — {embeds.shape[0]} proteins\")\n", "\n", "# --- Apply ----------------------------------------------------------------\n", "os.makedirs(OUT_DIR, exist_ok=True)\n", "\n", "process_split(\"data/mf-training.csv\", os.path.join(OUT_DIR, \"train_protbert.pkl\"))\n", "process_split(\"data/mf-validation.csv\", os.path.join(OUT_DIR, \"val_protbert.pkl\"))\n", "process_split(\"data/mf-test.csv\", os.path.join(OUT_DIR, \"test_protbert.pkl\"))\n" ] },
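{ "cell_type": "markdown", "id": "3c4d5e6f-7081-4b92-a3c4-d5e6f7081b92", "metadata": {}, "source": [ "What the cell above computes, in symbols: proteins longer than ProtBERT's 512-token window are embedded slice by slice and mean-pooled. With consecutive 512-residue slices $s_1, \\dots, s_k$, the per-protein embedding is\n", "\n", "$$e_{\\text{prot}} = \\frac{1}{k} \\sum_{i=1}^{k} \\mathrm{CLS}(s_i) \\in \\mathbb{R}^{1024},$$\n", "\n", "where $\\mathrm{CLS}(s_i)$ is the first-token embedding of slice $s_i$ from the fine-tuned encoder.\n" ] },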
slices))\n", "\n", " slice_embeds = []\n", " for i in range(0, len(slices_fmt), BATCH_TOK):\n", " batch = slices_fmt[i:i+BATCH_TOK]\n", " slice_embeds.append(get_embeddings(batch, tokenizer, model))\n", " slice_embeds = np.vstack(slice_embeds)\n", "\n", " prot_embed = slice_embeds.mean(axis=0)\n", " prot_ids.append(row[\"protein_id\"])\n", " embeds.append(prot_embed.astype(np.float32))\n", " labels.append(row[label_cols].values.astype(np.int8))\n", " gc.collect()\n", "\n", " embeds = np.vstack(embeds)\n", " labels = np.vstack(labels)\n", "\n", " joblib.dump({\n", " \"protein_ids\": prot_ids,\n", " \"embeddings\": embeds,\n", " \"labels\": labels,\n", " \"go_terms\": label_cols\n", " }, out_path, compress=3)\n", "\n", " print(f\"✓ Guardado {out_path} — {embeds.shape[0]} proteínas\")\n", "\n", "# Aplicar\n", "os.makedirs(OUT_DIR, exist_ok=True)\n", "\n", "process_split(\"data/mf-training.csv\", os.path.join(OUT_DIR, \"train_protbert.pkl\"))\n", "process_split(\"data/mf-validation.csv\", os.path.join(OUT_DIR, \"val_protbert.pkl\"))\n", "process_split(\"data/mf-test.csv\", os.path.join(OUT_DIR, \"test_protbert.pkl\"))\n" ] }, { "cell_type": "code", "execution_count": 27, "id": "ad0c5421-e0a1-4a6a-8ace-2c69aeab0e0d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✓ Corrigido: embeddings/train_protbert.pkl — 31142 exemplos, 597 GO terms\n", "✓ Corrigido: embeddings/val_protbert.pkl — 1724 exemplos, 597 GO terms\n", "✓ Corrigido: embeddings/test_protbert.pkl — 1724 exemplos, 597 GO terms\n" ] } ], "source": [ "import pandas as pd\n", "import joblib\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "\n", "# Obter GO terms do ficheiro de teste\n", "df_test = pd.read_csv(\"data/mf-test.csv\")\n", "test_terms = sorted(set(term for row in df_test[\"go_terms\"].str.split(\";\") for term in row))\n", "\n", "# Função para corrigir um .pkl com base nos GO terms do teste\n", "def patch_to_common_terms(csv_path, pkl_path, common_terms):\n", " df = pd.read_csv(csv_path)\n", " terms_split = df[\"go_terms\"].str.split(\";\")\n", " \n", " # Apenas termos presentes nos common_terms\n", " terms_filtered = terms_split.apply(lambda lst: [t for t in lst if t in common_terms])\n", " \n", " mlb = MultiLabelBinarizer(classes=common_terms)\n", " Y = mlb.fit_transform(terms_filtered)\n", "\n", " data = joblib.load(pkl_path)\n", " data[\"labels\"] = Y\n", " data[\"go_terms\"] = mlb.classes_.tolist()\n", " \n", " joblib.dump(data, pkl_path, compress=3)\n", " print(f\"✓ Corrigido: {pkl_path} — {Y.shape[0]} exemplos, {Y.shape[1]} GO terms\")\n", "\n", "# Aplicar às 3 partições\n", "patch_to_common_terms(\"data/mf-training.csv\", \"embeddings/train_protbert.pkl\", test_terms)\n", "patch_to_common_terms(\"data/mf-validation.csv\", \"embeddings/val_protbert.pkl\", test_terms)\n", "patch_to_common_terms(\"data/mf-test.csv\", \"embeddings/test_protbert.pkl\", test_terms)\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "1785d8a9-23fc-4490-8d71-29cc91a4cb57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✓ Embeddings carregados: (31142, 1024) → 597 GO terms\n", "Epoch 1/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0357 - binary_accuracy: 0.9894 - val_loss: 0.0334 - val_binary_accuracy: 0.9902\n", "Epoch 2/100\n", "974/974 [==============================] - 10s 11ms/step - loss: 0.0276 - binary_accuracy: 0.9914 - val_loss: 0.0328 - val_binary_accuracy: 0.9901\n", "Epoch 3/100\n", "974/974 
{ "cell_type": "code", "execution_count": 2, "id": "1785d8a9-23fc-4490-8d71-29cc91a4cb57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✓ Embeddings loaded: (31142, 1024) → 597 GO terms\n", "Epoch 1/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0357 - binary_accuracy: 0.9894 - val_loss: 0.0334 - val_binary_accuracy: 0.9902\n", "Epoch 2/100\n", "974/974 [==============================] - 10s 11ms/step - loss: 0.0276 - binary_accuracy: 0.9914 - val_loss: 0.0328 - val_binary_accuracy: 0.9901\n", "Epoch 3/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0268 - binary_accuracy: 0.9916 - val_loss: 0.0326 - val_binary_accuracy: 0.9904\n", "Epoch 4/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0264 - binary_accuracy: 0.9917 - val_loss: 0.0321 - val_binary_accuracy: 0.9902\n", "Epoch 5/100\n", "974/974 [==============================] - 11s 12ms/step - loss: 0.0260 - binary_accuracy: 0.9918 - val_loss: 0.0318 - val_binary_accuracy: 0.9903\n", "Epoch 6/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0257 - binary_accuracy: 0.9918 - val_loss: 0.0326 - val_binary_accuracy: 0.9903\n", "Epoch 7/100\n", "974/974 [==============================] - 11s 12ms/step - loss: 0.0255 - binary_accuracy: 0.9919 - val_loss: 0.0321 - val_binary_accuracy: 0.9906\n", "Epoch 8/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0252 - binary_accuracy: 0.9919 - val_loss: 0.0329 - val_binary_accuracy: 0.9904\n", "Epoch 9/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0251 - binary_accuracy: 0.9919 - val_loss: 0.0320 - val_binary_accuracy: 0.9905\n", "Epoch 10/100\n", "974/974 [==============================] - 11s 11ms/step - loss: 0.0249 - binary_accuracy: 0.9920 - val_loss: 0.0318 - val_binary_accuracy: 0.9904\n", "54/54 [==============================] - 0s 2ms/step\n", "Predictions saved to predictions/mf-protbert-pam1.npy\n", "Models saved in models/\n" ] } ], "source": [ "import tensorflow as tf\n", "import joblib\n", "import numpy as np\n", "import os\n", "from tensorflow.keras import Input\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout\n", "from tensorflow.keras.callbacks import EarlyStopping\n", "\n", "# Load embeddings\n", "train = joblib.load(\"embeddings/train_protbert.pkl\")\n", "val = joblib.load(\"embeddings/val_protbert.pkl\")\n", "test = joblib.load(\"embeddings/test_protbert.pkl\")\n", "\n", "X_train, y_train = train[\"embeddings\"], train[\"labels\"]\n", "X_val, y_val = val[\"embeddings\"], val[\"labels\"]\n", "X_test, y_test = test[\"embeddings\"], test[\"labels\"]\n", "\n", "print(f\"✓ Embeddings loaded: {X_train.shape} → {y_train.shape[1]} GO terms\")\n", "\n", "# Ensure label dimensions agree across splits\n", "max_classes = y_train.shape[1] # 597 GO terms (shared vocabulary after patching)\n", "\n", "# No-op after the patching cell (all splits already have 597 columns);\n", "# kept as a safeguard\n", "def pad_labels(y, target_dim=max_classes):\n", " if y.shape[1] < target_dim:\n", " padding = np.zeros((y.shape[0], target_dim - y.shape[1]), dtype=np.int8)\n", " return np.hstack([y, padding])\n", " return y\n", "\n", "y_val = pad_labels(y_val)\n", "y_test = pad_labels(y_test)\n", "\n", "# MLP model\n", "model = Sequential([\n", " Dense(1024, activation=\"relu\", input_shape=(X_train.shape[1],)),\n", " Dropout(0.3),\n", " Dense(512, activation=\"relu\"),\n", " Dropout(0.3),\n", " Dense(max_classes, activation=\"sigmoid\")\n", "])\n", "\n", "model.compile(loss=\"binary_crossentropy\",\n", " optimizer=\"adam\",\n", " metrics=[\"binary_accuracy\"])\n", "\n", "# Early stopping and training\n", "callbacks = [\n", " EarlyStopping(monitor=\"val_loss\", patience=5, restore_best_weights=True)\n", "]\n", "\n", "model.fit(X_train, y_train,\n", " validation_data=(X_val, y_val),\n", " epochs=100,\n", " batch_size=32,\n", " callbacks=callbacks,\n", " verbose=1)\n", "\n", "# Predictions\n", "os.makedirs(\"predictions\", exist_ok=True)\n", "y_prob = model.predict(X_test)\n", "np.save(\"predictions/mf-protbert-pam1.npy\", y_prob)\n", "print(\"Predictions saved to predictions/mf-protbert-pam1.npy\")\n", "\n", "# Save the model (HDF5 and native Keras formats)\n", "os.makedirs(\"models\", exist_ok=True)\n", "model.save(\"models/mlp_protbert.h5\")\n", "model.save(\"models/mlp_protbert.keras\")\n", "print(\"Models saved in models/\")" ] },
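{ "cell_type": "markdown", "id": "5e6f7081-92a3-4db4-85e6-f70a8192a3d4", "metadata": {}, "source": [ "The evaluation cell below uses protein-centric variants of the CAFA measures. With $T_i$ the true and $P_i(\\tau)$ the predicted term sets of protein $i$ at threshold $\\tau$:\n", "\n", "$$F_{\\max} = \\max_{\\tau} \\; \\frac{1}{N} \\sum_{i=1}^{N} \\frac{2\\, p_i(\\tau)\\, r_i(\\tau)}{p_i(\\tau) + r_i(\\tau)}, \\qquad p_i = \\frac{|T_i \\cap P_i|}{|P_i|}, \\quad r_i = \\frac{|T_i \\cap P_i|}{|T_i|},$$\n", "\n", "$$S = \\frac{1}{N} \\sum_{i=1}^{N} \\sqrt{\\big(\\alpha \\cdot ru_i\\big)^2 + \\big((1-\\alpha) \\cdot mi_i\\big)^2}, \\qquad ru_i = \\sum_{t \\in T_i \\setminus P_i} IC(t), \\quad mi_i = \\sum_{t \\in P_i \\setminus T_i} IC(t),$$\n", "\n", "with $\\alpha = 0.5$ and $IC$ estimated from term frequencies in the union of true and predicted labels. Two caveats: the official CAFA $F_{\\max}$ averages precision and recall over proteins *before* combining them, and $S_{\\min}$ is usually minimized over $\\tau$, whereas here $S$ is reported at the $F_{\\max}$ threshold.\n" ] },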
"model.save(\"models/mlp_protbert.h5\")\n", "model.save(\"models/mlp_protbert.keras\")\n", "print(\"Modelos guardado em models/\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "fdb66630-76dc-43a0-bd56-45052175fdba", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n", "✓ Embeddings: (1724, 597) labels × 597 GO terms\n", "\n", "📊 Resultados finais (ProtBERT + PAM1 + propagação):\n", "Fmax = 0.6611\n", "Thr. = 0.45\n", "AuPRC = 0.6951\n", "Smin = 13.4386\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.metrics import precision_recall_curve, auc\n", "from goatools.obo_parser import GODag\n", "import joblib\n", "import math\n", "\n", "# Parâmetros\n", "GO_FILE = \"go.obo\"\n", "THRESHOLDS = np.arange(0.0, 1.01, 0.01)\n", "ALPHA = 0.5\n", "\n", "# Carregar dados\n", "test = joblib.load(\"embeddings/test_protbert.pkl\")\n", "y_true = test[\"labels\"]\n", "terms = test[\"go_terms\"]\n", "y_prob = np.load(\"predictions/mf-protbert-pam1.npy\")\n", "go_dag = GODag(GO_FILE)\n", "\n", "print(f\"✓ Embeddings: {y_true.shape} labels × {len(terms)} GO terms\")\n", "\n", "# Fmax\n", "def compute_fmax(y_true, y_prob, thresholds):\n", " fmax, best_thr = 0, 0\n", " for t in thresholds:\n", " y_pred = (y_prob >= t).astype(int)\n", " tp = (y_true * y_pred).sum(axis=1)\n", " fp = ((1 - y_true) * y_pred).sum(axis=1)\n", " fn = (y_true * (1 - y_pred)).sum(axis=1)\n", " precision = tp / (tp + fp + 1e-8)\n", " recall = tp / (tp + fn + 1e-8)\n", " f1 = 2 * precision * recall / (precision + recall + 1e-8)\n", " avg_f1 = np.mean(f1)\n", " if avg_f1 > fmax:\n", " fmax, best_thr = avg_f1, t\n", " return fmax, best_thr\n", "\n", "# AuPRC micro\n", "def compute_auprc(y_true, y_prob):\n", " precision, recall, _ = precision_recall_curve(y_true.ravel(), y_prob.ravel())\n", " return auc(recall, precision)\n", "\n", "# Smin\n", "def compute_smin(y_true, y_prob, terms, threshold, go_dag, alpha=ALPHA):\n", " y_pred = (y_prob >= threshold).astype(int)\n", " ic = {}\n", " total = (y_true + y_pred).sum(axis=0).sum()\n", " for i, term in enumerate(terms):\n", " freq = (y_true[:, i] + y_pred[:, i]).sum()\n", " ic[term] = -np.log((freq + 1e-8) / total)\n", "\n", " s_values = []\n", " for true_vec, pred_vec in zip(y_true, y_pred):\n", " true_terms = {terms[i] for i in np.where(true_vec)[0]}\n", " pred_terms = {terms[i] for i in np.where(pred_vec)[0]}\n", "\n", " anc_true = set()\n", " for t in true_terms:\n", " if t in go_dag:\n", " anc_true |= go_dag[t].get_all_parents()\n", " anc_pred = set()\n", " for t in pred_terms:\n", " if t in go_dag:\n", " anc_pred |= go_dag[t].get_all_parents()\n", "\n", " ru = pred_terms - true_terms\n", " mi = true_terms - pred_terms\n", " dist_ru = sum(ic.get(t, 0) for t in ru)\n", " dist_mi = sum(ic.get(t, 0) for t in mi)\n", " s = math.sqrt((alpha * dist_ru)**2 + ((1 - alpha) * dist_mi)**2)\n", " s_values.append(s)\n", "\n", " return np.mean(s_values)\n", "\n", "# --- 6. Avaliar ----------------------------------------------------------\n", "fmax, thr = compute_fmax(y_true, y_prob, THRESHOLDS)\n", "auprc = compute_auprc(y_true, y_prob)\n", "smin = compute_smin(y_true, y_prob, terms, thr, go_dag)\n", "\n", "print(f\"\\n📊 Resultados finais (ProtBERT + PAM1 + propagação):\")\n", "print(f\"Fmax = {fmax:.4f}\")\n", "print(f\"Thr. 
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }