soumyatghosh commited on Jul 16, 2025

Commit

4527b5f

verified ·

1 Parent(s): f901faa

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.DS_Store +0 -0
.gitattributes +3 -0
README.md +314 -0
configs/README.md +3 -0
data/sample_data.h5ad +3 -0
data/sample_data_metadata.json +43 -0
poetry.lock +0 -0
pyproject.toml +178 -0
scripts/preprocess_sample_data.sh +37 -0
scripts/tokenize_sample_data.sh +39 -0
teddy/.DS_Store +0 -0
teddy/__init__.py +0 -0
teddy/data_processing/__init__.py +0 -0
teddy/data_processing/preprocessing/README.md +55 -0
teddy/data_processing/preprocessing/__init__.py +0 -0
teddy/data_processing/preprocessing/preprocess.py +516 -0
teddy/data_processing/tokenization/README.md +58 -0
teddy/data_processing/tokenization/__init__.py +0 -0
teddy/data_processing/tokenization/tokenization.py +419 -0
teddy/data_processing/utils/__init__.py +0 -0
teddy/data_processing/utils/bio_annotations/__init__.py +0 -0
teddy/data_processing/utils/bio_annotations/calculate_biostats.py +99 -0
teddy/data_processing/utils/bio_annotations/data/all_filtered.json +1227 -0
teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json +862 -0
teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json +125 -0
teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json +5 -0
teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json +415 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_cell_probs.json +15 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_disease_probs.json +13 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_sex_probs.json +5 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_tissue_probs.json +20 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/cell_probs_for_classification.json +15 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/disease_probs_for_classification.json +13 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/sex_probs_for_classification.json +5 -0
teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/tissue_probs_for_classification.json +20 -0
teddy/data_processing/utils/gene_mapping/__init__.py +0 -0
teddy/data_processing/utils/gene_mapping/data/2407_ensembl_processed.txt +0 -0
teddy/data_processing/utils/gene_mapping/data/2407_hgnc_mapping.any2any.txt +3 -0
teddy/data_processing/utils/gene_mapping/data/2407_mouse_gene_mapping.txt +0 -0
teddy/data_processing/utils/gene_mapping/data/human_mapping.txt +3 -0
teddy/data_processing/utils/gene_mapping/data/mouse_to_human_orthologs.one2one.txt +0 -0
teddy/data_processing/utils/gene_mapping/gene_mapper.py +629 -0
teddy/data_processing/utils/medians/data/teddy_gene_medians.json +0 -0
teddy/models/.DS_Store +0 -0
teddy/models/__init__.py +0 -0
teddy/models/classification_heads.py +285 -0
teddy/models/model_directory.py +53 -0
teddy/models/teddy_g/.DS_Store +0 -0
teddy/models/teddy_g/160M/added_tokens.json +7 -0
teddy/models/teddy_g/160M/config.json +26 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/sample_data.h5ad filter=lfs diff=lfs merge=lfs -text
+teddy/data_processing/utils/gene_mapping/data/2407_hgnc_mapping.any2any.txt filter=lfs diff=lfs merge=lfs -text
+teddy/data_processing/utils/gene_mapping/data/human_mapping.txt filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,314 @@

+# TEDDY: A Family of Foundation Models for Single-Cell Biology
+This repository provides open-source code and configurations supporting the **TEDDY** project, as described in:
+> **[TEDDY: A FAMILY OF FOUNDATION MODELS FOR UNDERSTANDING SINGLE CELL BIOLOGY](https://arxiv.org/abs/2503.03485)**
+TEDDY leverages large-scale single-cell RNA sequencing (scRNA-seq) data (~116 million cells) to train transformer-based models. These models capture disease-related signals and generalize to diverse downstream tasks, including cross-donor and cross-disease classification.
+---
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Project Goals & Paper Summary](#project-goals--paper-summary)
+3. [Pipeline Overview](#pipeline-overview)
+4. [Installation](#installation--setup)
+5. [Detailed Steps](#detailed-steps)
+   1. [Preprocessing & Tokenization](#1-preprocessing--tokenization)
+   2. [Loading TEDDY Models](#2-loading-teddy-models)
+6. [Running sample scripts on sample data](#running-sample-scripts-on-sample-data)
+7. [Running unit tests with pytest](#running-unit-tests-with-pytest)
+8. [Reference](#reference)
+---
+## Introduction
+Single-cell RNA sequencing data can span hundreds of millions of cells, each expressing thousands of genes. **TEDDY** (*T*ransformer for *E*nabling *D*rug *D*iscovery) adapts masked language modeling and ontology classification to gene expression. By scaling both **data volume** and **model capacity** (up to ~400M parameters), TEDDY learns robust biological features that generalize to **unseen diseases**, **unseen donors**, and more.
+---
+## Project Goals & Paper Summary
+Refer to the [paper](https://arxiv.org/abs/2503.03485) for full technical details. Key highlights:
+- **Data**: 116 million single cells, spanning multiple tissues, diseases, and human/mouse species.
+- **Models**:
+  - **TEDDY-G** (rank-based encoding)
+  - **TEDDY-X** (binned encoding)
+  - Sizes range from 10M to 400M parameters.
+- **Annotation Supervision**: Additional labels (disease, tissue, cell type, etc.) further refine model representations.
+- **Benchmarks**: “Held-out donors” and “held-out diseases” classification tasks showed significant gains over alternative foundation models.
+(Note: This release only includes the most performant models: TEDDY-G 70M, TEDDY-G 160M, and TEDDY-G 400M)
+---
+## Pipeline Overview
+The **TEDDY** pipeline involves three steps:
+1. **Preprocessing**
+   - Load `.h5ad` files, remove low-quality cells, normalize expression counts to 10000,
+   and median normalize.
+   - Outputs a “processed” `.h5ad` file.
+2. **Tokenization**
+   - Converts each cell’s expression profile into integer tokens or rank-based embeddings.
+   - Can embed metadata tokens that can be used as ontologies in the model (e.g., `<disease>`, `<tissue_type>`, `<sex>`, `<cell_type>`) if needed.
+3. **Model Inference and Training**
+   - Uses the tokenized dataset to generate embeddings for cells and genes.
+   - Uses the tokenized dataset for masked language modeling plus ontology classification.
+   - Model config examples live in dedicated config files for relevant architectures.
+---
+## Installation & Setup
+**Building Your Environment**
+### 1. Clone the Repository
+First, clone the repository to your local machine:
+```bash
+git clone XXX (update with the final public link)
+cd teddy-models
+```
+### 2. Environment Setup
+- Fine-tuning and pretraining of these models were conducted on GPUs, so ensure your instance is properly configured before working with large datasets.
+- Ensure you have ***Python 3.11.10*** installed. You can use `pyenv` to manage Python versions:
+  ```bash
+  pyenv install 3.11.10
+  pyenv local 3.11.10
+  ```
+  - More details on how to use `pyenv`: [pyenv documentation](https://github.com/pyenv/pyenv)
+- If you don’t already have ***Poetry*** installed, you can install it using the following command:
+  ```bash
+  curl -sSL https://install.python-poetry.org | python3 -
+  export PATH="/PATH/TO/YOUR/USER/.local/bin:$PATH"
+  ```
+- Check that poetry uses the correct python version:
+  ```bash
+  pyenv which python
+  ```
+  - Change to correct version by running:
+    ```bash
+    poetry env use /PATH/TO/YOUR/USER/.pyenv/versions/3.11.10/bin/python
+    ```
+- Run the following command to build the project and install its dependencies:
+  ```bash
+  poetry build
+  poetry install
+  ```
+- Once the setup is complete, you can use the package.
+---
+## Detailed Steps
+There are three ways to run **Preprocessing** and **Tokenization**:
+1. **Directly in Python** (importing the scripts)
+2. **Command-Line Arguments** (using flags)
+3. **JSON Config Files** (loading a `.json` with your parameters)
+### 1. Preprocessing & Tokenization
+#### Directly in Python
+Detailed [README.md for Preprocessing](teddy/data_processing/preprocessing/README.md) and [README.md for Tokenization](teddy/data_processing/tokenization/README.md) can be found in the related module folders.
+**Preprocessing example**:
+```
+from teddy.data_processing.preprocessing.preprocess import preprocess
+preprocessing_config = {
+  "min_gene_counts": 225,
+  "remove_assays": ["10x5' v1", "10x3' v1"],
+  "max_mitochondrial_prop": 10,
+  "remove_cell_types": [],
+  "hvg_method": None,
+  "normalized_total": 10000,
+  "median_dict": "teddy/data_processing/utils/medians/data/teddy_gene_medians.json",
+  "log1p": False,
+  "compute_medians": False,
+  "median_column": "index",
+  "reference_id_only": False,
+  "load_dir": "<PATH_TO_RAW_DATA_PARENT>",
+  "save_dir": "<PATH_TO_PROCESSED_DATA_PARENT>",
+}
+preprocess(
+  data_path="data/RAW_SAMPLES/my_data.h5ad",
+  metadata_path="data/RAW_SAMPLES/my_data_metadata.json",
+  hyperparameters=preprocessing_config
+)
+```
+The above preprocessing arguments were used to preprocess the corpus used for pretraining TEDDY models.
+**Tokenization example**:
+```
+from teddy.data_processing.tokenization.tokenization import tokenize
+tokenizer_config = {
+  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
+  "gene_id_column": "index",
+  "bio_annotations": True,
+  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
+  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
+  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
+  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
+  "max_shard_samples": 500,
+  "max_seq_len": 2048,
+  "pad_length": 2048,
+  "add_cls": False,
+  "bins": 0,
+  "continuous_rank": True,
+  "truncation_method": "max",
+  "add_disease_annotation": False,
+  "include_zero_genes": False,
+  "load_dir": "<PATH_TO_PROCESSED_DATA_PARENT>",
+  "save_dir": "<PATH_TO_TOKENIZED_DATA>"
+}
+tokenize(
+  data_path="outputs/preprocessed/my_data_preprocessed.h5ad",
+  metadata_path="outputs/preprocessed/my_data_preprocessed_metadata.json",
+  hyperparameters=tokenizer_config
+)
+```
+The above tokenization arguments were used for the TEDDY models.
+####  By Saving a `config.json` and Running It with Bash
+Example **preprocess_config.json**:
+```
+{
+  "min_gene_counts": null,
+  "remove_assays": [],
+  "max_mitochondrial_prop": null,
+  "remove_cell_types": [],
+  "hvg_method": null,
+  "normalized_total": 10000,
+  "median_dict": "teddy/data_processing/utils/medians/data/teddy_gene_medians.json",
+  "log1p": false,
+  "compute_medians": false,
+  "median_column": "index",
+  "reference_id_only": false,
+  "load_dir": "<PATH_TO_RAW_DATA_PARENT>",
+  "save_dir": "<PATH_TO_PROCESSED_DATA_PARENT>"
+}
+```
+Run:
+```
+python teddy/data_processing/preprocessing/preprocess.py \
+  --data_path data/RAW_SAMPLES/my_data.h5ad \
+  --metadata_path data/RAW_SAMPLES/my_data_metadata.json \
+  --config_path preprocess_config.json
+```
+(Same idea for tokenization, e.g. tokenize_config.json, then `--config_path tokenize_config.json`.)
+#### By Creating a `.sh` File and Executing It (With Poetry)
+You can find an example in **scripts/preprocess_sample_data.sh**:
+```
+#!/bin/bash -l
+# (Optional) Activate your Poetry environment
+poetry shell
+# 1) Generate a JSON config file on the fly
+cat <<EOF > configs/my_preprocess_config.json
+{
+  "load_dir": "data",
+  "save_dir": "data/processed",
+  "min_gene_counts": null,
+  "remove_assays": [],
+  "max_mitochondrial_prop": null,
+  "remove_cell_types": [],
+  "hvg_method": null,
+  "normalized_total": null,
+  "median_dict": "teddy/data_processing/utils/medians/data/teddy_gene_medians.json",
+  "log1p": false,
+  "compute_medians": false,
+  "median_column": "index",
+  "reference_id_only": false
+}
+EOF
+# 2) Call preprocess.py, explicitly passing data_path, metadata_path, and config_path
+python teddy/data_processing/preprocessing/preprocess.py \
+  --data_path data/sample_data.h5ad \
+  --metadata_path data/sample_data_metadata.json \
+  --config_path configs/my_preprocess_config.json
+```
+Then do:
+```
+chmod +x scripts/preprocess_sample_data.sh
+./scripts/preprocess_sample_data.sh
+```
+---
+You can override any parameter by specifying command-line arguments, editing the `.json`, or updating the Python dictionary.
+(Same idea for tokenization, e.g. use example in **scripts/tokenize_sample_data.sh**)
+### 2. Loading TEDDY Models
+If you want to load a trained TEDDY model in your Python code, you can do so with the following snippet:
+```
+from teddy.models.model_directory import get_architecture, model_dict
+model_name_or_path = 'teddy/models/teddy_g/400M' # or local path to model files
+arch = get_architecture(model_name_or_path)
+config_cls = model_dict[arch]["config_cls"]
+model_cls = model_dict[arch]["model_cls"]
+# Load the configuration and model
+config = config_cls.from_pretrained(model_name_or_path)
+model = model_cls.from_pretrained(model_name_or_path, config=config)
+# model is now ready for inference or further fine-tuning
+```
+You can then perform inference, fine-tuning, or evaluation with the model object as needed.
+## Running sample scripts on sample data
+In the `scripts` directory of this repository, sample code has been included with which to preprocess and tokenize the sample data in the `data` directory. To switch this out for your own data, simply replace the data within the `data` directory with your data and rename file paths within the scripts as needed.
+To run the scripts included, run the following commands from the root of the `teddy-models` repository.
+```
+chmod +x scripts/*
+./scripts/preprocess_sample_data.sh
+./scripts/tokenize_sample_data.sh
+```
+## Running unit tests with pytest
+To run the unit tests in the repository, you can run `poetry run pytest`. The tests should all pass, but receiving runtime warnings is expected behavior with the simulated data for the tests.
+## Reference
+Reference to cite when you use TEDDY:
+```
+@misc{chevalier2025teddyfamilyfoundationmodels,
+      title={TEDDY: A Family Of Foundation Models For Understanding Single Cell Biology},
+      author={Alexis Chevalier and Soumya Ghosh and Urvi Awasthi and James Watkins and Julia Bieniewska and Nichita Mitrea and Olga Kotova and Kirill Shkura and Andrew Noble and Michael Steinbaugh and Julien Delile and Christoph Meier and Leonid Zhukov and Iya Khalil and Srayanta Mukherjee and Judith Mueller},
+      year={2025},
+      eprint={2503.03485},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2503.03485},
+}
+```

configs/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Configuration files
2	+
3	+ This project includes scripts that generate and utilize configuration files. These configuration files are essential for the proper functioning of the scripts and are automatically saved in this designated directory.

data/sample_data.h5ad ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15c61493446545284e86f6dfd276a6ba6825ca15e4a1bf2333be3149de6bc330
+size 36585033

data/sample_data_metadata.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+    "donor_id": [
+        "Subject2",
+        "Subject3",
+        "Subject1",
+        "CTRL-2",
+        "CTRL-4"
+    ],
+    "cell_count": 3538,
+    "organism": [
+        "Homo sapiens"
+    ],
+    "sampling_parameters": [
+        {
+            "train_prop": 1.0,
+            "test_prop": 0.0,
+            "test_donors_prop": 0.0,
+            "train_donors": [
+                "Subject4",
+                "Subject5",
+                "Subject6",
+                "Subject7",
+                "Subject8",
+                "CTRL-1",
+                "CTRL-3",
+                "CTRL-5",
+                "CTRL-6",
+                "CTRL-7",
+                "CTRL-8"
+            ],
+            "test_donors": [
+                "Subject1",
+                "Subject2",
+                "Subject3",
+                "CTRL-2",
+                "CTRL-4"
+            ],
+            "include_nonhuman": false,
+            "load_dir": "public/data/cellxgene_single_datasets/alzheimers/transcriptomiscs_tangle_neurons/original/inhibitory/",
+            "save_dir": "public/data/cellxgene_single_datasets/paper_evaluations/alzheimers/transcriptomiscs_tangle_neurons/inhibitory/sampled/split1/"
+        }
+    ]
+}

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,178 @@

+[tool.poetry]
+name = "teddy"
+version = "0.1.0"
+description = "A module for fine-tuning and preprocessing foundational models."
+authors = ["Olga Kotova <[email protected]>"]
+license = "MIT"
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "3.11.10"
+accelerate = "0.30.1"
+aiohttp = "3.9.5"
+aiosignal = "1.3.1"
+alembic = "1.13.2"
+aniso8601 = "9.0.1"
+anndata = "0.10.7"
+attrs = "23.2.0"
+azure-common = "1.1.28"
+azure-core = "1.30.1"
+azure-mgmt-core = "1.4.0"
+azure-mgmt-storage = "21.1.0"
+azure-storage-blob = "12.20.0"
+beautifulsoup4 = "4.12.3"
+blinker = "1.8.2"
+boto3 = "1.34.112"
+botocore = "1.34.112"
+cachetools = "5.3.3"
+certifi = "2024.7.4"
+cffi = "1.16.0"
+charset-normalizer = "3.3.2"
+click = "8.1.7"
+cloudpickle = "3.0.0"
+contourpy = "1.2.1"
+cryptography = "44.0.0"
+cycler = "0.12.1"
+datasets = "2.19.1"
+deprecated = "1.2.14"
+dill = "0.3.8"
+docker = "7.1.0"
+docker-pycreds = "0.4.0"
+fabric = "3.2.2"
+filelock = "3.14.0"
+flask = "3.0.3"
+fonttools = "4.51.0"
+frozenlist = "1.4.1"
+fsspec = "2024.3.1"
+gdown = "5.2.0"
+gitdb = "4.0.11"
+gitpython = "3.1.43"
+graphene = "3.3"
+graphql-core = "3.2.3"
+graphql-relay = "3.2.0"
+greenlet = "3.0.3"
+gunicorn = "22.0.0"
+h5py = "3.11.0"
+huggingface-hub = "0.23.1"
+hyperopt = "0.1.2"
+idna = "3.7"
+igraph = "0.11.5"
+isodate = "0.6.1"
+itsdangerous = "2.2.0"
+jinja2 = "3.1.4"
+jmespath = "1.0.1"
+joblib = "1.4.2"
+kiwisolver = "1.4.5"
+legacy-api-wrap = "1.4"
+leidenalg = "0.10.2"
+llvmlite = "0.42.0"
+mako = "1.3.5"
+markdown = "3.6"
+markupsafe = "2.1.5"
+matplotlib = "3.9.0"
+mlflow = "2.16.0"
+mpmath = "1.3.0"
+multidict = "6.0.5"
+multiprocess = "0.70.16"
+natsort = "8.4.0"
+networkx = "3.3"
+numba = "0.59.1"
+numpy = "1.26.4"
+opentelemetry-api = "1.25.0"
+opentelemetry-sdk = "1.25.0"
+opentelemetry-semantic-conventions = "0.46b0"
+pandas = "2.2.2"
+patsy = "0.5.6"
+pillow = "10.3.0"
+protobuf = "4.25.3"
+psutil = "5.9.8"
+pyarrow = "15.0.2"
+pycparser = "2.22"
+pydot = "2.0.0"
+pymongo = "4.7.2"
+pynndescent = "0.5.12"
+pyparsing = "3.1.2"
+pysocks = "1.7.1"
+python-box = "7.1.1"
+python-dateutil = "2.9.0.post0"
+pytz = "2024.1"
+pyyaml = "6.0.1"
+regex = "2024.5.15"
+requests = "2.32.2"
+s3transfer = "0.10.1"
+safetensors = "0.4.3"
+scanpy = "1.10.1"
+scib = "1.1.5"
+scikit-learn = "1.5.0"
+scikit-misc = "0.3.1"
+scipy = "1.13.0"
+scvi = "0.6.8"
+seaborn = "0.13.2"
+sentry-sdk = "2.8.0"
+session-info = "1.0.0"
+setproctitle = "1.3.3"
+smmap = "5.0.1"
+soupsieve = "2.5"
+sqlalchemy = "2.0.31"
+sqlparse = "0.5.0"
+statsmodels = "0.14.2"
+sympy = "1.12"
+texttable = "1.7.0"
+threadpoolctl = "3.5.0"
+tokenizers = "0.19.1"
+torch = "^2.3.0 || >=2.0.1"
+torchtext = "^0.18.0 || >=0.15.2"
+torchvision = "^0.18.0 || >=0.15.2"
+tqdm = "4.66.4"
+transformers = "4.41.0"
+tzdata = "2024.1"
+umap-learn = "0.5.6"
+urllib3 = "2.2.2"
+wandb = "0.17.0"
+werkzeug = "3.0.6"
+wrapt = "1.16.0"
+xxhash = "3.4.1"
+yarl = "1.9.4"
+jupyter = "^1.1.1"
+ipykernel = "^6.29.5"
+tensorboard = "^2.19.0"
+pydantic = "^2.10.6"
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.0"
+black = "^24.3"
+isort = "^5.0"
+ruff = "^0.0.286"
+pre-commit = "^4.0.1"
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+[tool.black]
+skip-string-normalization = true
+line-length = 120
+[tool.ruff]
+# Same as Black.
+line-length = 120
+exclude = ["jupyter_notebook_config.py"]
+select = [
+    "E",  # pycodestyle errors (settings from FastAPI, thanks, @tiangolo!)
+    "W",  # pycodestyle warnings
+    "F",  # pyflakes
+    "I",  # isort
+    "C",  # flake8-comprehensions
+    "B",  # flake8-bugbear
+]
+ignore = [
+    "E501",  # line too long, handled by black
+    "C901",  # too complex
+]
+[tool.ruff.isort]
+order-by-type = true
+relative-imports-order = "closest-to-furthest"
+extra-standard-library = ["typing"]
+section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"]
+known-first-party = []

scripts/preprocess_sample_data.sh ADDED Viewed

	@@ -0,0 +1,37 @@

+#!/bin/bash -l
+# (Optional) Activate your Poetry environment
+poetry shell
+# Generate a timestamp string (e.g., 20230404123056)
+TS=$(date '+%Y%m%d%H%M%S')
+CONFIG_FILE="configs/preprocessing_config_${TS}.json"
+# 1) Generate a JSON config file on the fly
+cat <<EOF > "$CONFIG_FILE"
+{
+  "load_dir": "data",
+  "save_dir": "data/processed",
+  "min_gene_counts": null,
+  "remove_assays": [],
+  "max_mitochondrial_prop": null,
+  "remove_cell_types": [],
+  "hvg_method": null,
+  "normalized_total": null,
+  "median_dict": "teddy/data_processing/utils/medians/data/teddy_gene_medians.json",
+  "log1p": false,
+  "compute_medians": false,
+  "median_column": "index",
+  "reference_id_only": false
+}
+EOF
+# 2) Call preprocess.py, explicitly passing data_path, metadata_path, and config_path
+python teddy/data_processing/preprocessing/preprocess.py \
+  --data_path data/sample_data.h5ad \
+  --metadata_path data/sample_data_metadata.json \
+  --config_path "$CONFIG_FILE"

scripts/tokenize_sample_data.sh ADDED Viewed

	@@ -0,0 +1,39 @@

+#!/bin/bash -l
+# Tokenize the preprocessed sample dataset with the TEDDY tokenization script.
+# Run this from the repository root: every path below is relative to it.
+#
+# Stop at the first failing command, unset variable, or failed pipeline stage,
+# so a failed config write never reaches the Python step.
+set -euo pipefail
+# Generate a timestamp string (e.g., 20230404123056)
+TS=$(date '+%Y%m%d%H%M%S')
+CONFIG_FILE="configs/tokenization_config_${TS}.json"
+# Make sure the target directory exists before writing the config into it.
+mkdir -p "$(dirname "$CONFIG_FILE")"
+# Create the config file containing your tokenization arguments
+cat <<EOF > "$CONFIG_FILE"
+{
+  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
+  "gene_id_column": "index",
+  "bio_annotations": true,
+  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
+  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
+  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
+  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
+  "max_shard_samples": 500,
+  "max_seq_len": 2048,
+  "pad_length": 2048,
+  "add_cls": false,
+  "bins": 0,
+  "continuous_rank": true,
+  "add_disease_annotation": false,
+  "include_zero_genes": false,
+  "load_dir": "data/processed",
+  "save_dir": "data/tokenized"
+}
+EOF
+# Execute the tokenization.py script with three arguments:
+# --data_path, --metadata_path, and --config_path
+# `poetry run` executes the command inside the project's Poetry environment.
+# `poetry shell` is deliberately not used: it opens an interactive subshell
+# instead of activating the environment for the remaining lines of a script.
+poetry run python teddy/data_processing/tokenization/tokenization.py \
+  --data_path data/processed/sample_data.h5ad \
+  --metadata_path data/processed/sample_data_metadata.json \
+  --config_path "$CONFIG_FILE"

teddy/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

teddy/__init__.py ADDED Viewed

File without changes

teddy/data_processing/__init__.py ADDED Viewed

File without changes

teddy/data_processing/preprocessing/README.md ADDED Viewed

	@@ -0,0 +1,55 @@

+# PreprocessReadMe.md
+The `preprocess.py` script is designed to preprocess gene expression data for use in our models. It takes in `data.raw.X` or `data.X` data, applies various preprocessing techniques, and prepares it for training or inference.
+# General Workflow
+The script follows these main steps:
+0. **Load Data and Metadata**: The script starts by loading the gene expression data from an AnnData file and metadata from a JSON file.
+1. **Set Raw Layer**: It checks if the `data.raw.X` layer is set in the AnnData object. If not, it sets it based on the integer counts in the `data.X`.
+2. **Initialize Processed Layer**: It initializes the `data.layers['processed']` layer in the AnnData object, which is the layer that will be affected by preprocessing.
+3. **Filter Genes by Reference ID**: It filters genes based on reference IDs if specified in the hyperparameters.
+4. **Remove Assays**: It removes specified assays from the data.
+5. **Filter Cells by Gene Counts**: It filters out cells with gene counts below a specified threshold.
+6. **Filter Cells by Mitochondrial Fraction**: It removes cells with a high mitochondrial gene fraction.
+7. **Filter Highly Variable Genes**: It filters genes to retain only highly variable ones using specified methods.
+8. **Normalize Data**: It normalizes the data by applying row (cell level) normalization and scaling.
+9. **Scale Columns by Median**: It scales columns based on median values from a specified dictionary.
+10. **Log Transform**: It applies a log+1 transformation to the data.
+11. **Compute Medians**: It computes and saves medians of the processed data if specified.
+12. **Update Metadata**: It updates the metadata with cell counts and processing arguments.
+13. **Save and Cleanup**: It saves the processed data and metadata to disk and performs garbage collection.
+# Preprocessing Arguments
+The script uses several preprocessing arguments to control its behavior. Here is an explanation of each argument and the steps they influence:
+- `reference_id_only`
+    - Description: Specifies whether to filter genes by reference ID.
+    - Impact: If enabled, the script filters genes based on reference IDs.
+- `remove_assays`
+    - Description: List of assays to remove from the data.
+    - Impact: The script removes specified assays from the data.
+- `min_gene_counts`
+    - Description: Minimum gene counts required for cells to be retained.
+    - Impact: The script filters out cells with gene counts below this threshold.
+- `max_mitochondrial_prop`
+    - Description: Maximum mitochondrial gene fraction allowed for cells.
+    - Impact: The script removes cells with a mitochondrial gene fraction above this threshold.
+- `hvg_method`
+    - Description: Method to use for filtering highly variable genes.
+    - Impact: The script filters genes to retain only highly variable ones using the specified method.
+- `normalized_total`
+    - Description: Value to normalize the total gene expression to.
+    - Impact: The script normalizes the data by applying row (cell level) normalization and scaling.
+- `median_dict`
+    - Description: Path to a JSON file containing median values for scaling columns.
+    - Impact: The script scales columns based on median values from the specified dictionary.
+- `median_column`
+    - Description: Column name to use for looking up median values.
+    - Impact: The script uses this column to look up median values for scaling.
+- `log1p`
+    - Description: Indicates whether to apply a log transformation to the data.
+    - Impact: If enabled, the script applies a log transformation to the data.
+- `compute_medians`
+    - Description: Indicates whether to compute and save medians of the processed data.
+    - Impact: If enabled, the script computes and saves medians of the processed data.

teddy/data_processing/preprocessing/__init__.py ADDED Viewed

File without changes

teddy/data_processing/preprocessing/preprocess.py ADDED Viewed

	@@ -0,0 +1,516 @@

+"""
+Module: preprocess.py
+This module provides a preprocessing pipeline for single-cell RNA sequencing (scRNA-seq) data
+stored in AnnData format. It includes functions for loading data, filtering cells and genes,
+normalizing and scaling data, and saving processed results. The pipeline is designed to be
+configurable via hyperparameters and supports various preprocessing steps such as mitochondrial
+gene filtering, highly variable gene selection, and log transformation.
+Main Features:
+- Load and preprocess scRNA-seq data in AnnData format.
+- Filter cells and genes based on various criteria.
+- Normalize, scale, and log-transform data.
+- Save processed data and metadata to disk.
+- Configurable via JSON-based hyperparameters.
+Dependencies:
+- anndata, numpy, pandas, scanpy, scipy, sklearn
+Usage:
+- Run this script as a standalone program with a configuration file specifying the hyperparameters.
+- Import the `preprocess` function and call it with the data path, metadata path, and hyperparameters.
+"""
+import gc
+import json
+import os
+import warnings
+from argparse import ArgumentParser
+from typing import Sequence, Optional, Union
+from pathlib import Path
+import anndata as ad
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from anndata import ImplicitModificationWarning
+import scipy.sparse as sp
+from scipy.sparse import csr_matrix, issparse
+from sklearn.utils import sparsefuncs, sparsefuncs_fast
+from teddy.data_processing.utils.gene_mapping.gene_mapper import (
+    map_mouse_human,
+    map_mouse_human2,
+)
+# --- 1. Reference list of the 37 human mitochondrial genes (Ensembl IDs) -----
+_HUMAN_MITO_ENSEMBL= {
+    "ENSG00000211459", "ENSG00000210082",  # rRNAs
+    # tRNAs (22)
+    "ENSG00000210049", "ENSG00000210077", "ENSG00000209082",
+    "ENSG00000210100", "ENSG00000210107", "ENSG00000210112",
+    "ENSG00000210119", "ENSG00000210122", "ENSG00000210116",
+    "ENSG00000210117", "ENSG00000210118", "ENSG00000210124",
+    "ENSG00000210126", "ENSG00000210134", "ENSG00000210135",
+    "ENSG00000210142", "ENSG00000210144", "ENSG00000210148",
+    "ENSG00000210150", "ENSG00000210155", "ENSG00000210196",
+    "ENSG00000210151",
+    # protein-coding (13)
+    "ENSG00000198888", "ENSG00000198763", "ENSG00000198840",
+    "ENSG00000198886", "ENSG00000212907", "ENSG00000198786",
+    "ENSG00000198695", "ENSG00000198804", "ENSG00000198712",
+    "ENSG00000198938", "ENSG00000198899", "ENSG00000228253",
+    "ENSG00000198727",
+}
+_HUMAN_MITO_SYMBOLS = {
+    "MT-RNR1", "MT-RNR2", "MT-TF", "MT-TV", "MT-TL1", "MT-TI", "MT-TQ",
+    "MT-TM", "MT-TW", "MT-TA", "MT-TN", "MT-TC", "MT-TY", "MT-TD", "MT-TK",
+    "MT-TG", "MT-TR", "MT-TH", "MT-TS2", "MT-TL2", "MT-TT", "MT-TE", "MT-TP",
+    "MT-TS1", "MT-ND1", "MT-ND2", "MT-ND3", "MT-ND4", "MT-ND4L", "MT-ND5",
+    "MT-ND6", "MT-CO1", "MT-CO2", "MT-CO3", "MT-ATP6", "MT-ATP8", "MT-CYB",
+}
+def load_data_and_metadata(data_path: str, metadata_path: str):
+    """
+    Load an AnnData h5ad file (data_processing) and a JSON file (metadata).
+    """
+    data = ad.read_h5ad(data_path)
+    with open(metadata_path, "r") as f:
+        metadata = json.load(f)
+    return data, metadata
+def set_raw_if_necessary(data: ad.AnnData):
+    """
+    Make sure ``data.raw`` is populated with integer-valued counts.
+
+    Lookup order:
+    1. ``data.raw`` already set -> nothing to do.
+    2. ``data.layers['counts']`` looks like counts -> it becomes ``data.raw``.
+    3. ``data.X`` looks like counts -> ``data`` itself becomes ``data.raw``.
+
+    "Looks like counts" means the first 64 rows contain only whole numbers;
+    only that sample is densified, never the full matrix.
+
+    Returns
+    -------
+    AnnData or None
+        ``data`` with ``.raw`` set, or ``None`` when no integer-valued matrix
+        was found (the caller is expected to skip the dataset).
+    """
+    def _head_is_integral(matrix, n_rows=64):
+        """Return True when the first `n_rows` rows of `matrix` are whole numbers."""
+        if matrix is None:
+            return False
+        head = matrix[:n_rows]
+        # Sparse rows are densified; any other array-like goes through
+        # np.asarray, so an unexpected matrix type can no longer leave the
+        # sample undefined or silently reuse the sample of an earlier check.
+        head = head.toarray() if issparse(head) else np.asarray(head)
+        return bool(np.all(np.equal(np.mod(head, 1), 0)))
+
+    if data.raw is not None:
+        return data  # Already has raw
+
+    # Prefer an explicit 'counts' layer when it holds integers.
+    if "counts" in data.layers and _head_is_integral(data.layers["counts"]):
+        data.raw = ad.AnnData(X=data.layers["counts"], var=data.var.copy())
+        return data
+
+    # Otherwise fall back to data.X.
+    if _head_is_integral(data.X):
+        data.raw = data
+        return data
+
+    print("No integer-valued matrix found")
+    return None
+def initialize_processed_layer(data: ad.AnnData):
+    """
+    Guarantee that ``data.layers['processed']`` exists.
+
+    An existing layer is left untouched; otherwise it is created as a float32
+    copy of ``data.raw.X``. The (same) AnnData object is returned.
+    """
+    if "processed" in data.layers:
+        return data
+    data.layers["processed"] = data.raw.X.astype("float32")
+    return data
+# ---------------------------------------------------
+# Helper factored out of the original inline pipeline code, so that the
+# reference-ID filtering step is defined in one place.
+# ---------------------------------------------------
+def filter_reference_id(data: ad.AnnData, hyperparameters: dict):
+    """
+    Keep only genes that map to a reference ID and collapse duplicated IDs.
+
+    Genes in ``data.var`` are mapped with ``map_mouse_human2`` when
+    ``hyperparameters['mouse_nonorthologs']`` is truthy, otherwise with
+    ``map_mouse_human``. Genes whose reference ID is empty or missing are
+    dropped. When several genes share one reference ID, only the first one is
+    kept and its 'processed' column becomes the per-cell maximum over the group.
+
+    Returns
+    -------
+    AnnData
+        Subset of `data` with a CSR 'processed' layer and a new
+        ``var['reference_id']`` column.
+    """
+    # NOTE: the mapping tables are addressed by relative path, so they are
+    # resolved against the current working directory.
+    human_map = pd.read_csv("teddy/data_processing/utils/gene_mapping/data/human_mapping.txt", sep="\t")
+    mouse_map = pd.read_csv("teddy/data_processing/utils/gene_mapping/data/2407_mouse_gene_mapping.txt", sep="\t")
+    orthologs = pd.read_csv(
+        "teddy/data_processing/utils/gene_mapping/data/mouse_to_human_orthologs.one2one.txt", sep="\t"
+    )
+    # Both mappers take identical arguments; the flag only selects which one runs.
+    mapper = map_mouse_human2 if hyperparameters.get("mouse_nonorthologs", False) else map_mouse_human
+    reference_id = mapper(
+        data_frame=data.var,
+        query_column=None,
+        human_map_db=human_map,
+        mouse_map_db=mouse_map,
+        orthology_db=orthologs,
+    )["reference_id"]
+    # A missing (NaN) ID counts as "unmapped", exactly like the empty string.
+    valid_mask = reference_id.notna() & (reference_id != "")
+    data = data[:, valid_mask].copy()
+    reference_id = reference_id[valid_mask].reset_index(drop=True)
+    if not isinstance(data.layers["processed"], np.ndarray):
+        corrected = data.layers["processed"].toarray()
+    else:
+        corrected = data.layers["processed"]
+    # Group column positions by reference ID in one pass instead of scanning
+    # the whole ID vector once per unique ID.
+    positions_by_id = reference_id.groupby(reference_id, sort=False).indices
+    vars_to_keep = []
+    for positions in positions_by_id.values():
+        first = int(positions.min())
+        vars_to_keep.append(first)
+        if len(positions) > 1:
+            # Duplicated ID: the surviving column takes the per-cell maximum.
+            corrected[:, first] = corrected[:, positions].max(axis=1)
+    vars_to_keep = sorted(vars_to_keep)
+    corrected = corrected[:, vars_to_keep]
+    data = data[:, vars_to_keep]
+    with warnings.catch_warnings():
+        # Writing to the view created above triggers anndata's implicit copy.
+        warnings.filterwarnings("ignore", category=ImplicitModificationWarning)
+        data.layers["processed"] = csr_matrix(corrected)
+        data.var["reference_id"] = list(reference_id[vars_to_keep])
+    gc.collect()
+    return data
+# End of inline helper
+# ---------------------------------------------------
+def remove_assays(data: ad.AnnData, assays_to_remove: list):
+    """
+    Drop observations whose ``obs['assay']`` value is listed in `assays_to_remove`.
+
+    When ``data.obs`` has no 'assay' column there is nothing to filter on and
+    `data` is returned unchanged; otherwise a filtered copy is returned.
+    """
+    if "assay" not in data.obs:
+        return data
+    keep = ~data.obs["assay"].isin(assays_to_remove)
+    data = data[keep].copy()
+    gc.collect()
+    return data
+def filter_cells_by_gene_counts(data: ad.AnnData, min_count: int):
+    """
+    Return a copy of `data` without the cells that fall below `min_count`.
+
+    The per-cell pass/fail mask comes from ``sc.pp.filter_cells`` applied to
+    the 'processed' layer with ``min_counts=min_count``.
+    """
+    passing = sc.pp.filter_cells(data.layers["processed"], min_counts=min_count)[0]
+    kept_rows = np.where(passing)
+    filtered = data[kept_rows].copy()
+    del passing
+    gc.collect()
+    return filtered
+def filter_cells_by_mitochondrial_fraction(data: ad.AnnData, max_mito_prop: float):
+    """
+    Remove low-quality cells whose mitochondrial read fraction exceeds *max_mito_prop*.
+    DO NOT RUN THIS IN ANY PREPROCESSING PIPELINE UNTIL YOU HAVE SET RAW COUNTS
+    Parameters
+    ----------
+    data
+        `AnnData` object containing counts in ``.X``. Works with dense or
+        sparse matrices.
+    max_mito_prop
+        Threshold above which cells are discarded.
+    Returns
+    -------
+    AnnData
+        A **copy** of `data` with poor-quality cells removed. Two columns are
+        added to ``.obs`` (of the input as well as of the returned copy):
+        - **mito_prop** – per-cell mitochondrial fraction
+        - **poor_quality_mito** – boolean flag marking dropped cells
+        When no mitochondrial gene is present in ``data.var_names`` the input
+        object itself is returned, unmodified.
+    """
+    # Counts are read from data.X; preprocess() sets raw counts before this step.
+    counts = data.X
+    var_index = data.var_names
+    # Match against both naming schemes at once. This avoids guessing the
+    # scheme from the first gene only, and also works when `var` is empty.
+    mito_idx = np.flatnonzero(var_index.isin(_HUMAN_MITO_ENSEMBL | _HUMAN_MITO_SYMBOLS))
+    if mito_idx.size == 0:
+        _logger.info("No mitochondrial genes found, returning data")
+        return data
+    # np.asarray(...).ravel() flattens both the (n, 1) matrix returned by
+    # sparse sums and the 1-D array returned by dense sums.
+    total = np.asarray(counts.sum(axis=1)).ravel()
+    mito = np.asarray(counts[:, mito_idx].sum(axis=1)).ravel()
+    # Cells with zero total counts get a fraction of 0 instead of a division by zero.
+    mito_prop = mito / np.maximum(total, 1)
+    data.obs["mito_prop"] = mito_prop
+    data.obs["poor_quality_mito"] = mito_prop > max_mito_prop
+    filtered = data[~data.obs["poor_quality_mito"]].copy()
+    gc.collect()
+    return filtered
+def filter_highly_variable_genes(data: ad.AnnData, method: str):
+    """
+    Restrict `data` to its highly variable genes.
+
+    An existing ``var['highly_variable']`` annotation is reused. When it is
+    missing, scanpy computes it first (``n_top_genes=10000``) using `method`
+    as the flavor, which must be "seurat_v3" or "cell_ranger".
+
+    Returns
+    -------
+    AnnData
+        A copy of `data` holding only the highly variable genes.
+    """
+    if "highly_variable" not in data.var:
+        # With default arguments scanpy only annotates `data.var` in place and
+        # does not subset, so the subsetting below is needed in this case too.
+        sc.pp.highly_variable_genes(data, flavor=method, n_top_genes=10000)
+    # Return a real copy, like the other filters in this module, rather than a view.
+    data = data[:, data.var["highly_variable"]].copy()
+    gc.collect()
+    return data
+def normalize_data_inplace(matrix_csr: csr_matrix, norm_value: float):
+    """
+    Rescale every row of `matrix_csr` in place so that it sums to `norm_value`.
+
+    Rows are first L1-normalized and then multiplied by `norm_value`.
+    `matrix_csr` must be a CSR matrix; nothing is returned.
+    """
+    # Step 1: L1-normalize each row.
+    sparsefuncs_fast.inplace_csr_row_normalize_l1(matrix_csr)
+    # Step 2: one identical scale factor per row.
+    n_rows = matrix_csr.shape[0]
+    per_row_scale = np.array([norm_value for _ in range(n_rows)])
+    sparsefuncs.inplace_row_scale(matrix_csr, per_row_scale)
+    gc.collect()
+def scale_columns_by_median_dict(layer: csr_matrix, data: ad.AnnData, median_dict_path: str, median_column: str):
+    """
+    Divide each column of `layer` (in place) by the median stored for its gene.
+
+    The medians are read from the JSON file at `median_dict_path`. Genes are
+    looked up by ``data.var.index`` when `median_column` is "index", otherwise
+    by ``data.var[median_column]``. Genes missing from the file keep a scale
+    factor of 1.0.
+    """
+    with open(median_dict_path) as handle:
+        gene_medians = json.load(handle)
+    lookup_keys = data.var.index if median_column == "index" else data.var[median_column]
+    scale = np.array(
+        [1.0 / gene_medians[gene] if gene in gene_medians else 1.0 for gene in lookup_keys]
+    )
+    # Column-wise in-place scaling of the CSR matrix.
+    sparsefuncs.inplace_csr_column_scale(layer, scale)
+def log_transform_layer(data: ad.AnnData, layer_name: str = "processed"):
+    """
+    Log1p-transform ``data.layers[layer_name]`` in place via ``sc.pp.log1p``.
+
+    Nothing is returned; `data` itself is modified.
+    """
+    sc.pp.log1p(data, copy=False, layer=layer_name)
+def compute_and_save_medians(data: ad.AnnData, data_path: str, hyperparameters: dict):
+    """
+    Compute the per-gene median of the non-zero 'processed' values and save it as JSON.
+
+    Zeros are masked as NaN before taking column medians, so each median
+    covers expressing cells only. Genes with no non-zero value are left out.
+    Keys come from ``data.var.index`` when ``hyperparameters['median_column']``
+    is "index", otherwise from that ``data.var`` column.
+
+    The output path is `data_path` with ``hyperparameters['load_dir']``
+    replaced by ``hyperparameters['save_dir']`` and ``.h5ad`` replaced by
+    ``_medians.json``.
+    """
+    layer = data.layers["processed"]
+    if hasattr(layer, "toarray"):
+        mat = layer.toarray()
+    else:
+        # Dense layer: work on a copy so the NaN masking never touches the stored data.
+        mat = np.array(layer)
+    if not np.issubdtype(mat.dtype, np.floating):
+        mat = mat.astype("float32")  # NaN cannot be stored in integer arrays
+    with warnings.catch_warnings():
+        # Columns without any non-zero entry are all-NaN; numpy warns about them.
+        warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")
+        mat[mat == 0] = np.nan
+        medians = np.nanmedian(mat, axis=0)
+    if hyperparameters["median_column"] == "index":
+        median_var = pd.Series(data.var.index)
+    else:
+        median_var = data.var[hyperparameters["median_column"]]
+    valid_idxs = np.flatnonzero(~np.isnan(medians))
+    median_values = {median_var.iloc[k]: medians[k].item() for k in valid_idxs}
+    save_path = data_path.replace(hyperparameters["load_dir"], hyperparameters["save_dir"])
+    save_path = save_path.replace(".h5ad", "_medians.json")
+    # preprocess() calls this before save_and_cleanup() has created the output
+    # directory, so make sure it exists here.
+    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+    with open(save_path, "w") as f:
+        json.dump(median_values, f, indent=4)
+def update_metadata(metadata: dict, data: ad.AnnData, hyperparameters: dict):
+    """
+    Record the current cell count and append `hyperparameters` to the processing history.
+
+    ``metadata['cell_count']`` is set to ``data.n_obs``. The history is kept
+    under whichever of "processing_args" / "processings_args" already exists.
+    The second spelling is what this function has always written for fresh
+    metadata, so it is still used when neither key is present. An existing
+    history list is extended rather than nested, and a single existing entry
+    is wrapped into a list first.
+
+    Returns the (mutated) `metadata` dict.
+    """
+    metadata["cell_count"] = data.n_obs
+    for key in ("processing_args", "processings_args"):
+        if key in metadata:
+            previous = metadata[key]
+            history = list(previous) if isinstance(previous, list) else [previous]
+            metadata[key] = history + [hyperparameters]
+            break
+    else:
+        # No history yet: keep the key name used by earlier runs.
+        metadata["processings_args"] = [hyperparameters]
+    return metadata
+def save_and_cleanup(data: ad.AnnData, metadata: dict, data_path: str, metadata_path: str, hyperparameters: dict):
+    """
+    Write the processed AnnData and its metadata into ``hyperparameters['save_dir']``.
+
+    Both files keep their original base names. ``raw.X``, ``X`` and the
+    'processed' layer are converted to CSR before writing, and the h5ad file
+    is gzip-compressed.
+
+    Returns
+    -------
+    tuple
+        ``(None, None)`` when `data` has no cells (nothing is written),
+        ``(True, True)`` after both files were written.
+    """
+    save_dir = hyperparameters["save_dir"]
+    save_processed_path = os.path.join(save_dir, os.path.basename(data_path))  # e.g. "sample_data.h5ad"
+    save_metadata_path = os.path.join(save_dir, os.path.basename(metadata_path))  # e.g. "sample_data_metadata.json"
+    # Both outputs live directly in save_dir, so one makedirs call covers them.
+    os.makedirs(save_dir or ".", exist_ok=True)
+    if data.n_obs == 0:
+        return None, None
+    # Ensure relevant matrices are CSR.
+    if not isinstance(data.raw.X, csr_matrix):
+        # Rebuild raw from a converted copy instead of assigning to raw.X.
+        # NOTE(review): presumably Raw.X is read-only in anndata -- confirm.
+        raw_adata = data.raw.to_adata()
+        raw_adata.X = csr_matrix(raw_adata.X)
+        data.raw = raw_adata
+    if not isinstance(data.X, csr_matrix):
+        data.X = csr_matrix(data.X)
+    if "processed" in data.layers and not isinstance(data.layers["processed"], csr_matrix):
+        data.layers["processed"] = csr_matrix(data.layers["processed"])
+    try:
+        data.write_h5ad(save_processed_path, compression="gzip")
+    except Exception:
+        # Rare bug with categorical indexes: an obs column named like the obs
+        # index breaks writing. Drop that column and retry once; a second
+        # failure propagates to the caller.
+        if data.obs.index.name in data.obs.columns:
+            del data.obs[data.obs.index.name]
+        data.write_h5ad(save_processed_path, compression="gzip")
+    del data
+    gc.collect()
+    with open(save_metadata_path, "w") as f:
+        json.dump(metadata, f, indent=4)
+    return True, True
+def preprocess(data_path: str, metadata_path: str, hyperparameters: dict):
+    """
+    Run the preprocessing pipeline on one AnnData file.
+
+    Steps (optional steps are switched by their `hyperparameters` entry):
+    1. Load data & metadata
+    2. Ensure data.raw holds integer counts (otherwise skip the dataset)
+    3. Initialize 'processed' layer
+    4. Filter genes by reference_id          (``reference_id_only``)
+    5. Remove assays                         (``remove_assays``)
+    6. Filter cells (min gene counts)        (``min_gene_counts``)
+    7. Filter cells (max mito fraction)      (``max_mitochondrial_prop``)
+    8. HVG filtering                         (``hvg_method``)
+    9. Normalize total                       (``normalized_total``)
+    10. Median-based column scaling          (``median_dict``, ``median_column``)
+    11. Log transform                        (``log1p``)
+    12. Compute medians                      (``compute_medians``)
+    13. Update metadata and save
+
+    Returns
+    -------
+    tuple
+        ``(True, True)`` when the processed files were written, ``(None, None)``
+        when no integer count matrix was found or no cells were left to save.
+    """
+    # 1. Load
+    data, metadata = load_data_and_metadata(data_path, metadata_path)
+    # 2. Ensure data.raw if needed
+    data = set_raw_if_necessary(data)
+    if data is None:
+        return None, None
+    # 3. Initialize 'processed'
+    data = initialize_processed_layer(data)
+    # 4. Possible map/reference_id
+    if hyperparameters["reference_id_only"]:
+        data = filter_reference_id(data, hyperparameters)
+    # 5. Remove assays
+    if "assay" in data.obs and hyperparameters["remove_assays"]:
+        data = remove_assays(data, hyperparameters["remove_assays"])
+    # 6. Filter cells by min gene counts
+    if hyperparameters["min_gene_counts"]:
+        data = filter_cells_by_gene_counts(data, hyperparameters["min_gene_counts"])
+    # 7. Filter cells by mitochondrial fraction
+    if hyperparameters["max_mitochondrial_prop"]:
+        data = filter_cells_by_mitochondrial_fraction(
+            data, hyperparameters["max_mitochondrial_prop"])
+    # 8. HVG filtering
+    if hyperparameters["hvg_method"] in ["seurat_v3", "cell_ranger"]:
+        data = filter_highly_variable_genes(data, hyperparameters["hvg_method"])
+    # Steps 9 and 10 modify the 'processed' matrix in place, so `data` must own
+    # its matrices (not be a view) and the layer must be CSR for both steps,
+    # not only when normalization is enabled.
+    if hyperparameters["normalized_total"] or hyperparameters["median_dict"]:
+        if data.is_view:
+            data = data.copy()
+        if not isinstance(data.layers["processed"], csr_matrix):
+            data.layers["processed"] = csr_matrix(data.layers["processed"])
+    # 9. Normalize total (row L1 + scale)
+    if hyperparameters["normalized_total"]:
+        normalize_data_inplace(data.layers["processed"], hyperparameters["normalized_total"])
+    # 10. Scale columns using median_dict
+    if hyperparameters["median_dict"]:
+        scale_columns_by_median_dict(
+            data.layers["processed"], data, hyperparameters["median_dict"], hyperparameters["median_column"]
+        )
+    # 11. Log1p transform
+    if hyperparameters["log1p"]:
+        log_transform_layer(data, "processed")
+    # 12. Possibly compute medians
+    if hyperparameters["compute_medians"]:
+        compute_and_save_medians(data, data_path, hyperparameters)
+    # 13. Update metadata, save & cleanup
+    metadata = update_metadata(metadata, data, hyperparameters)
+    return save_and_cleanup(data, metadata, data_path, metadata_path, hyperparameters)
+###############################################################################
+# Main block
+###############################################################################
+if __name__ == "__main__":
+    # Command-line entry point: preprocess one .h5ad file using a JSON config.
+    parser = ArgumentParser(description="Preprocess scRNA-seq data stored in AnnData format.")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        required=True,
+        help="Path to the input .h5ad file."
+    )
+    parser.add_argument(
+        "--metadata_path",
+        type=str,
+        required=True,
+        help="Path to the input metadata JSON file."
+    )
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        required=True,
+        help="Path to the JSON configuration file containing hyperparameters."
+    )
+    args = parser.parse_args()
+    # Load hyperparameters from JSON
+    with open(args.config_path, "r") as f:
+        hyperparameters = json.load(f)
+    # Call the pipeline
+    success, _ = preprocess(
+        data_path=args.data_path,
+        metadata_path=args.metadata_path,
+        hyperparameters=hyperparameters
+    )
+    if success:
+        print("Preprocessing completed successfully.")
+    else:
+        # preprocess() returns (None, None) both when no integer count matrix
+        # is found and when every cell was filtered out.
+        print("Preprocessing produced no output (no integer count matrix found or 0 cells left), no file saved.")

teddy/data_processing/tokenization/README.md ADDED Viewed

	@@ -0,0 +1,58 @@

+The `tokenize_for_model.py` script is designed to tokenize gene expression data for use in our models. It takes in processed data, applies various tokenization techniques, and prepares it for training or inference.
+# General Workflow
+The script follows these main steps:
+0. **Load Tokenization Arguments**: The script starts by loading the tokenization arguments from a configuration file or dictionary.
+1. **Load Gene Tokenizer**: It loads a pre-trained gene tokenizer based on the provided tokenization arguments.
+2. **Load AnnData**: The script reads the gene expression data from an AnnData file.
+3. **Check Genes in Tokenizer**: It verifies that the genes in the dataset are present in the tokenizer's vocabulary.
+4. **Build Token Array**: The script constructs a token array for the genes in the dataset.
+5. **Convert Processed Layer to Dense**: It converts the processed layer of the AnnData object to a dense matrix.
+6. **Tokenize in Batches**: The script processes the data in batches, applying tokenization and optional binning or ranking.
+7. **Save Tokenized Data**: Finally, the script saves the tokenized data to disk.
+# Tokenization Arguments
+The script uses several tokenization arguments to control its behavior. Here is an explanation of each argument and the steps they influence:
+- `max_seq_len`
+    - Description: Specifies the maximum sequence length for the tokenized data.
+    - Impact: Determines the number of genes to include in each tokenized sequence (cell). If add_cls is enabled, the sequence length is reduced by one to accommodate the CLS token.
+- `add_cls`
+    - Description: Indicates whether to prepend a CLS token to each sequence.
+    - Impact: If enabled, a CLS token is added to the beginning of each sequence, and the sequence length is adjusted accordingly.
+- `cls_token_id`
+    - Description: The token ID to use for the CLS token.
+    - Impact: If add_cls is enabled, this token ID is used for the CLS token.
+- `random_genes`
+    - Description: Specifies whether to select a random subset of genes before applying top-k selection
+    - Impact: If enabled, a random subset of genes is selected for each batch, and then the top-k values are determined from this subset.
+- `include_zero_genes`
+    - Description: Indicates whether to include zero-expression genes in the tokenized data.
+    - Impact: If enabled, zero-expression genes are included in the tokenized sequences. Otherwise, they are filtered out.
+- `bins`
+    - Description: Specifies the number of bins to use for binning expression values.
+    - Impact: If set, the script bins the expression values into the specified number of bins. This argument is only relevant for TEDDY-X.
+- `continuous_rank`
+    - Description: Indicates whether to rank expression values continuously.
+    - Impact: If enabled, the script ranks the expression values in the range [-1, 1]. This argument is only relevant for TEDDY-X.
+- `gene_seed`
+    - Description: A random seed for reproducibility.
+    - Impact: If set, the script uses this seed to ensure reproducible random operations.
+- `gene_id_column`
+    - Description: The column name in the AnnData object that contains gene IDs.
+    - Impact: The script uses this column to match the genes in the dataset against the tokenizer's vocabulary.
+- `label_column`
+    - Description: The column name in the AnnData object that contains classification labels
+    - Impact: If set, the script adds these labels to the tokenized data.
+- `bio_annotations`
+    - Description: Indicates whether to add biological annotations to the tokenized data.
+    - Impact: If enabled, the script adds annotations such as disease, tissue, cell type, and sex to the tokenized data.
+- `disease_mapping`, `tissue_mapping`, `cell_mapping`, `sex_mapping`
+    - Description: File paths to JSON files containing mappings for biological annotations.
+    - Impact: The script uses these mappings to convert biological annotations to token IDs.
+- `add_disease_annotation`
+    - Description: Indicates whether to override labels with disease annotations.
+    - Impact: If enabled, the script overrides the labels with disease annotations.
+- `max_shard_samples`
+    - Description: The maximum number of samples per shard when saving the tokenized data.
+    - Impact: The script splits the tokenized data into shards with the specified maximum number of samples.

teddy/data_processing/tokenization/__init__.py ADDED Viewed

File without changes

teddy/data_processing/tokenization/tokenization.py ADDED Viewed

	@@ -0,0 +1,419 @@

+"""
+Module: tokenization.py
+This module provides a tokenization pipeline for preprocessed single-cell RNA sequencing (scRNA-seq) data.
+It converts gene expression data stored in AnnData format into tokenized sequences that can
+be used for downstream machine learning tasks, such as masked language modeling or classification.
+Main Features:
+- Tokenizes gene expression data into integer tokens using a custom GeneTokenizer.
+- Supports additional biological annotations (e.g., disease, tissue, cell type, sex).
+- Handles both top-k and random gene selection for tokenization.
+- Configurable via JSON-based hyperparameters or TokenizationArgs objects.
+- Saves tokenized data in Hugging Face Dataset format for efficient processing.
+Dependencies:
+- anndata, numpy, torch, datasets, tqdm
+Usage:
+- Run this script as a standalone program with a configuration file specifying the hyperparameters.
+- Import the `tokenize` function and call it with the data path, metadata path, and tokenization arguments.
+"""
+import gc
+import os
+import json
+import random
+import shutil
+from argparse import ArgumentParser
+from typing import Union
+import anndata as ad
+import numpy as np
+import torch
+from datasets import Dataset, load_from_disk
+from tqdm import tqdm
+from teddy.tokenizer.gene_tokenizer import GeneTokenizer
+from teddy.tokenizer.tokenization_args import TokenizationArgs
+###############################################################################
+# Updated Functions
+###############################################################################
+def _bin_values(vals_list, tokenization_args, no_sorting=False):
+    """
+    Replace each row of expression values by bin indices.
+
+    Parameters
+    ----------
+    vals_list
+        Iterable of 1-D tensors or numpy arrays, one per cell.
+    tokenization_args
+        Object exposing ``bins`` (number of bins).
+    no_sorting
+        False => "positional chunk" binning. The row is expected to be sorted
+        in descending order (top-k output), so bins depend only on position:
+        the first entries get the highest bin and the last entries bin 0.
+        True  => value-based binning for unsorted rows (labels): values are
+        bucketized into ``bins`` equal-width intervals between the row minimum
+        and maximum, with a minimum bin of 1.
+
+    Returns
+    -------
+    list of torch.Tensor
+        One tensor per input row, with the same length as that row. An empty
+        row yields an empty tensor.
+    """
+    n_bins = tokenization_args.bins
+    binned_vals = []
+    for vals in vals_list:
+        if isinstance(vals, np.ndarray):
+            vals = torch.tensor(vals)
+        n_vals = len(vals)
+        if no_sorting:
+            # Value-based binning.
+            if n_vals > 0:
+                bin_edges = torch.linspace(vals.min(), vals.max(), steps=n_bins + 1)
+                binned = torch.clamp(torch.bucketize(vals, bin_edges), min=1).float()
+            else:
+                binned = torch.zeros_like(vals, dtype=torch.float)
+            binned_vals.append(binned)
+            continue
+        # Positional binning: every bin index repeated equally often, ascending.
+        num_repetitions = max(1, n_vals // n_bins)
+        bin_pattern = torch.arange(0, n_bins).unsqueeze(1).repeat(1, num_repetitions).flatten()
+        if len(bin_pattern) > n_vals:
+            # Keep the LAST n_vals entries. The start index is explicit because
+            # a negative slice would be [-0:] for an empty row, which keeps the
+            # whole pattern instead of nothing.
+            bin_pattern = bin_pattern[len(bin_pattern) - n_vals:]
+        else:
+            extra = n_vals - len(bin_pattern)
+            if extra > 0:
+                # Left-pad with bin 0; after the flip these land at the tail.
+                bin_pattern = torch.cat([torch.zeros(extra), bin_pattern])
+        # Flip so that the highest bin comes first, matching the descending row.
+        binned_vals.append(bin_pattern.flip(0))
+    return binned_vals
+def _rank_continuous(vals, tokenization_args):
+    """
+    Map a row of expression values to evenly spaced ranks in [-1, 1].
+
+    Only the row length is used: the first position receives the highest rank
+    and the last position receives -1. An empty row is returned as is (numpy
+    input is converted to a tensor first). `tokenization_args` is accepted for
+    signature compatibility and is not read.
+    """
+    row = torch.tensor(vals) if isinstance(vals, np.ndarray) else vals
+    n_vals = len(row)
+    if n_vals == 0:
+        return row
+    return torch.linspace(-1, 1, steps=n_vals).flip(0)
+def _prepare_tokenizer_args(tokenization_args: Union[dict, TokenizationArgs]):
+    """
+    Normalize the tokenization configuration and seed the random generators.
+
+    A dict is turned into a ``TokenizationArgs`` object; an existing
+    ``TokenizationArgs`` is used as is. When ``gene_seed`` is set, Python's
+    ``random``, numpy and torch (plus CUDA, if available) are seeded with it.
+
+    Returns
+    -------
+    tuple
+        ``(token_args, load_dir, save_dir)``
+    """
+    if not isinstance(tokenization_args, dict):
+        # Already a TokenizationArgs instance.
+        token_args_obj = tokenization_args
+        load_dir = token_args_obj.load_dir
+        save_dir = token_args_obj.save_dir
+    else:
+        load_dir = tokenization_args["load_dir"]
+        save_dir = tokenization_args["save_dir"]
+        token_args_obj = TokenizationArgs(**tokenization_args)
+    seed = token_args_obj.gene_seed
+    if seed is not None:
+        # Seed every generator that may be used downstream, for reproducibility.
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+    return token_args_obj, load_dir, save_dir
+def _check_genes_in_tokenizer(data: ad.AnnData, gene_id_column: str, tokenizer: GeneTokenizer):
+    """
+    Find which genes of `data` are present in the tokenizer vocabulary.
+
+    Gene IDs are taken from ``data.var.index`` when `gene_id_column` is
+    "index", otherwise from ``data.var[gene_id_column]``.
+
+    Returns
+    -------
+    tuple
+        ``(gene_in_vocab, coding_genes, ratio)``: positions of the genes found
+        in the vocabulary, their IDs, and the fraction of genes found.
+
+    Raises
+    ------
+    OSError
+        When the dataset has no genes, or when fewer than 10% of the gene IDs
+        are in the vocabulary.
+    """
+    if gene_id_column == "index":
+        gene_index = data.var.index
+    else:
+        gene_index = data.var[gene_id_column]
+    n_genes = len(gene_index)
+    if n_genes == 0:
+        raise OSError("Dataset contains no genes; cannot match gene IDs against tokenizer vocab.")
+    # Read the vocabulary once instead of once per gene.
+    vocab = tokenizer.vocab
+    gene_in_vocab = np.flatnonzero([g in vocab for g in gene_index])
+    # `gene_in_vocab` holds positions. A var column is a Series indexed by gene
+    # name, so it must be indexed positionally via .iloc; an Index has no .iloc
+    # and is positional already.
+    if hasattr(gene_index, "iloc"):
+        coding_genes = gene_index.iloc[gene_in_vocab]
+    else:
+        coding_genes = gene_index[gene_in_vocab]
+    ratio = len(gene_in_vocab) / n_genes
+    if ratio < 0.1:
+        raise OSError(
+            f"Only {ratio:.2%} of gene IDs found in tokenizer vocab. " "Check gene_id_column or vocab mismatch."
+        )
+    return gene_in_vocab, coding_genes, ratio
+def _build_batch_tensors(X_batch: torch.Tensor, token_array: torch.Tensor, token_args, data=None, obs_indices=None):
+    """
+    Select the genes that represent each cell of a batch.
+
+    `X_batch` is (batch_size x num_genes). For every row the largest values
+    are taken, up to ``max_seq_len`` of them (one fewer when a CLS token is
+    added). With ``random_genes`` a random subset of columns is drawn per row
+    first and the top values are taken within that subset. With ``add_cls``
+    the CLS token id is prepended to the gene ids and a value of 1 to the values.
+
+    `data` and `obs_indices` are accepted but not used.
+
+    Returns
+    -------
+    tuple
+        ``(gene_ids, values, None, None)``; the last two slots (labels and
+        decoder values) are always ``None`` here.
+    """
+    n_cells, n_genes = X_batch.shape[0], X_batch.shape[1]
+    if token_args.add_cls:
+        budget = token_args.max_seq_len - 1  # leave room for the CLS token
+    else:
+        budget = token_args.max_seq_len
+    if not token_args.random_genes:
+        # Plain top-k over all genes.
+        top_vals, top_indices = torch.topk(X_batch, k=min(budget, n_genes), largest=True, sorted=True)
+    else:
+        # Draw a random column subset per row, then take the top-k within it.
+        sampled_cols = torch.stack([torch.randperm(n_genes)[:budget] for _ in range(n_cells)])
+        sampled_vals = torch.gather(X_batch, 1, sampled_cols)
+        top_vals, order = torch.topk(
+            sampled_vals, k=min(budget, sampled_vals.shape[1]), largest=True, sorted=True
+        )
+        # Translate positions inside the subset back to column indices.
+        top_indices = torch.gather(sampled_cols, 1, order)
+    gene_ids = token_array[top_indices]
+    if token_args.add_cls:
+        cls_col = torch.tensor(token_args.cls_token_id).repeat(n_cells, 1)
+        gene_ids = torch.cat([cls_col, gene_ids], dim=1)
+        top_vals = torch.cat([torch.ones(n_cells, 1, dtype=top_vals.dtype), top_vals], dim=1)
+    return gene_ids, top_vals, None, None
+###############################################################################
+# Main tokenize function
+###############################################################################
+def tokenize(data_path: str, metadata_path: str, tokenization_args: Union[dict, TokenizationArgs]):
+    """
+    Tokenizes gene expression data stored in AnnData format.
+    Args:
+        data_path (str): Path to the AnnData file containing preprocessed gene expression data.
+        metadata_path (str): Path to the metadata file in JSON format.
+        tokenization_args (Union[dict, TokenizationArgs]): Configuration for tokenization.
+    """
+    token_args, load_dir, save_dir = _prepare_tokenizer_args(tokenization_args)
+    # 1) Load GeneTokenizer
+    tokenizer = GeneTokenizer.from_pretrained(token_args.tokenizer_name_or_path)
+    if token_args.cls_token_id is None:
+        token_args.cls_token_id = tokenizer.cls_token_id
+    # 2) Load AnnData
+    data = ad.read_h5ad(data_path)
+    if "processed" not in data.layers:
+        raise ValueError(f"Missing 'processed' layer in {data_path}")
+    # 3) Genes in vocab
+    gene_in_vocab, coding_genes, ratio = _check_genes_in_tokenizer(data, token_args.gene_id_column, tokenizer)
+    print(f"{ratio:.2%} of genes found in tokenizer vocab")
+    # 5) Build token array for these genes
+    token_array = torch.tensor(tokenizer.encode(coding_genes.tolist(), add_special_tokens=False))
+    # 6) Convert processed layer to dense
+    X_matrix = data.layers["processed"].toarray()
+    # 7) Prepare final dictionary => HF Dataset
+    all_data = {"gene_ids": [], "values": []}
+    BATCH_SIZE = 512
+    n_obs = data.shape[0]
+    for start_idx in tqdm(range(0, n_obs, BATCH_SIZE), desc="Tokenizing in batches"):
+        end_idx = min(start_idx + BATCH_SIZE, n_obs)
+        obs_indices = np.arange(start_idx, end_idx)
+        X_batch = torch.tensor(X_matrix[obs_indices, :][:, gene_in_vocab], dtype=torch.float)
+        gene_ids_batch, vals_batch, labels_batch, decoder_vals_batch = _build_batch_tensors(
+            X_batch,
+            token_array,
+            token_args,
+            data=None,
+            obs_indices=None,
+        )
+        final_gene_list = []
+        final_vals_list = []
+        final_labels_list = []
+        if "decoder_values" in data.layers:
+            final_decoder_vals_list = []
+        # Filter out zero if needed
+        # or keep them
+        for row_idx in range(len(gene_ids_batch)):
+            g_row = gene_ids_batch[row_idx]
+            v_row = vals_batch[row_idx]
+            if labels_batch is not None:
+                lb_row = labels_batch[row_idx]
+            else:
+                lb_row = None
+            if decoder_vals_batch is not None:
+                dec_v_row = decoder_vals_batch[row_idx]
+            else:
+                dec_v_row = None
+            if not token_args.include_zero_genes:
+                nonzero_mask = v_row != 0
+                g_row = g_row[nonzero_mask]
+                v_row = v_row[nonzero_mask]
+                if lb_row is not None:
+                    lb_row = lb_row[nonzero_mask]
+                if dec_v_row is not None:
+                    dec_v_row = dec_v_row[nonzero_mask]
+            final_gene_list.append(g_row)
+            final_vals_list.append(v_row)
+            final_labels_list.append(lb_row)
+            if "decoder_values" in data.layers:
+                final_decoder_vals_list.append(dec_v_row)
+        # If we do binning or rank => apply them
+        if token_args.bins and token_args.continuous_rank:
+            raise ValueError("Should not use bins and continuous_rank simultaneously.")
+        if token_args.bins:
+            # possibly do no_sorting if we are binning "labels"
+            # we only do "no_sorting=True" for labels, but let's keep it simple for now
+            final_vals_list = _bin_values(final_vals_list, token_args, no_sorting=False)
+        elif token_args.continuous_rank:
+            for i, vals in enumerate(final_vals_list):
+                final_vals_list[i] = _rank_continuous(vals, token_args)
+        # Add to all_data
+        for row_idx in range(len(final_gene_list)):
+            all_data["gene_ids"].append(final_gene_list[row_idx].tolist())
+            all_data["values"].append(final_vals_list[row_idx].tolist())
+    if token_args.label_column:
+        all_data["labels"] = data.obs[token_args.label_column].cat.codes.values.tolist()
+    # bio_annotations
+    if token_args.bio_annotations:
+        with open(token_args.disease_mapping) as f:
+            disease_mapping = json.load(f)
+        with open(token_args.tissue_mapping) as f:
+            tissue_mapping = json.load(f)
+        with open(token_args.cell_mapping) as f:
+            cell_mapping = json.load(f)
+        with open(token_args.sex_mapping) as f:
+            sex_mapping = json.load(f)
+        if "disease" not in data.obs.columns:
+            data.obs["disease"] = "normal"
+        if "tissue" not in data.obs.columns:
+            data.obs["tissue"] = "cultured cell"
+        if "sex" not in data.obs.columns:
+            data.obs["sex"] = "unknown"
+        if "cell_type" not in data.obs.columns:
+            data.obs["cell_type"] = "unknown"
+        mapped_diseases = [disease_mapping[k] for k in data.obs["disease"].tolist()]
+        mapped_tissues = [tissue_mapping[k] for k in data.obs["tissue"].tolist()]
+        mapped_cell_types = [cell_mapping[k] for k in data.obs["cell_type"].tolist()]
+        mapped_sexes = [sex_mapping[k] for k in data.obs["sex"].tolist()]
+        all_data["disease"] = tokenizer.encode(mapped_diseases, add_special_tokens=False)
+        all_data["tissue"] = tokenizer.encode(mapped_tissues, add_special_tokens=False)
+        all_data["cell_type"] = tokenizer.encode(mapped_cell_types, add_special_tokens=False)
+        all_data["sex"] = tokenizer.encode(mapped_sexes, add_special_tokens=False)
+    if token_args.add_disease_annotation:
+        # We override "labels" with "disease" tokens
+        all_data["labels"] = all_data["disease"]
+    del data
+    gc.collect()
+    dataset = Dataset.from_dict(all_data)
+    num_samples = len(dataset)
+    if token_args.max_shard_samples:
+        num_shards = num_samples // min(token_args.max_shard_samples, num_samples)
+    else:
+        num_shards = 1
+    # Compute the path of data_path relative to load_dir
+    relative_data_path = os.path.relpath(data_path, load_dir)
+    relative_metadata_path = os.path.relpath(metadata_path, load_dir)
+    # Remove the ".h5ad" extension from data_path if desired
+    no_extension_data_path = os.path.splitext(relative_data_path)[0]
+    # Reconstruct the final paths under save_dir
+    save_tokenized_data_path = os.path.join(save_dir, no_extension_data_path)
+    save_metadata_path = os.path.join(save_dir, relative_metadata_path)
+    dataset.save_to_disk(save_tokenized_data_path, num_shards=num_shards)
+    shutil.copy(metadata_path, save_metadata_path)
+###############################################################################
+# A simple shard function
+###############################################################################
+def shard_hf_dataset(data_path: str, metadata_path: str, tokenization_args: Union[dict, TokenizationArgs]):
+    """
+    Shards a Hugging Face Dataset into smaller chunks for efficient storage and processing.
+    """
+    if isinstance(tokenization_args, dict):
+        load_dir = tokenization_args["load_dir"]
+        save_dir = tokenization_args["save_dir"]
+        token_args_obj = TokenizationArgs(**tokenization_args)
+    else:
+        load_dir = tokenization_args.load_dir
+        save_dir = tokenization_args.save_dir
+        token_args_obj = tokenization_args
+    all_data = load_from_disk(data_path)
+    num_samples = len(all_data)
+    if token_args_obj.max_shard_samples:
+        num_shards = num_samples // min(token_args_obj.max_shard_samples, num_samples)
+    else:
+        num_shards = 1
+    save_tokenized_data_path = data_path.replace(load_dir, save_dir)
+    save_metadata_path = metadata_path.replace(load_dir, save_dir)
+    all_data.save_to_disk(save_tokenized_data_path, num_shards=num_shards)
+    shutil.copy(metadata_path, save_metadata_path)
+###############################################################################
+# Main block
+###############################################################################
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Tokenize an AnnData file for downstream ML tasks.")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        required=True,
+        help="Path to the .h5ad file containing the preprocessed scRNA-seq data."
+    )
+    parser.add_argument(
+        "--metadata_path",
+        type=str,
+        required=True,
+        help="Path to the JSON file containing metadata."
+    )
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        required=True,
+        help="Path to the JSON file specifying tokenization hyperparameters."
+    )
+    args = parser.parse_args()
+    # Load tokenization arguments from JSON
+    with open(args.config_path, "r") as f:
+        tokenization_args = json.load(f)
+    # Call the tokenize function
+    tokenize(
+        data_path=args.data_path,
+        metadata_path=args.metadata_path,
+        tokenization_args=tokenization_args
+    )

teddy/data_processing/utils/__init__.py ADDED Viewed

File without changes

teddy/data_processing/utils/bio_annotations/__init__.py ADDED Viewed

File without changes

teddy/data_processing/utils/bio_annotations/calculate_biostats.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""
+Module: calculate_biostats.py
+This module calculates and aggregates biological statistics from single-cell RNA sequencing (scRNA-seq) data
+stored in AnnData format. It generates per-category statistics (e.g., disease, cell type, tissue, sex)
+and computes the median expression values for genes across datasets. The results are saved as JSON and CSV files
+for downstream analysis.
+Main Features:
+- Computes the per-gene median of the non-zero expression values in the "processed" layer of an AnnData file (see `make_median_list`; the command-line entry point does not call it).
+- Generates category-wise statistics (e.g., counts of diseases, cell types, tissues, and sexes).
+- Aggregates statistics across multiple training datasets.
+- Outputs results in JSON and CSV formats for easy integration with other tools.
+Dependencies:
+- anndata: For handling AnnData files.
+- numpy: For numerical operations, including median calculations.
+- pandas: For creating and exporting tabular data.
+- tqdm: For progress visualization during processing.
+- glob: For recursive file searching.
+Usage:
+- Run this script as a standalone program with the following arguments:
+   - `--load_dir`: Directory containing the training `.h5ad` files.
+   - `--stats_dict_name`: Path to save the aggregated statistics JSON file.
+"""
+import json
+import os
+from argparse import ArgumentParser
+from glob import glob
+import anndata as ad
+import numpy as np
+import pandas as pd
+from datasets.utils.logging import disable_progress_bar
+from tqdm import tqdm
+def make_median_list(file, out_file):
+    data = ad.read_h5ad(file)
+    # set up gene ids
+    gene_index = data.var.index
+    all_X = data.layers["processed"].toarray()
+    all_X[all_X == 0] = np.nan
+    median = np.nanmedian(all_X, axis=0)  # (gene_ids,)
+    num_median = np.where(~np.isnan(median))[0]
+    median_dict = {gene_index[k]: median[k].item() for k in num_median}
+    with open(out_file, "w") as f:
+        json.dump(median_dict, f, indent=4)
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--load_dir", default="")
+    parser.add_argument("--stats_dict_name", default="")
+    args = parser.parse_args()
+    disable_progress_bar()
+    # calculate median of medians
+    all_train = list(glob(args.load_dir + "/**/train_*.h5ad", recursive=True))
+    print("Generating individual stats")
+    for train in tqdm(all_train):
+        data = ad.read_h5ad(train, backed="r+")
+        stats = {}
+        for cat in ["disease", "cell_type", "tissue", "sex"]:
+            stats[cat] = data.obs[cat].value_counts().to_dict()
+        with open(os.path.join(os.path.dirname(train), "bio_stats.json"), "w") as f:
+            json.dump(stats, f, indent=4)
+    print("Collecting stats")
+    summary_dict = {}
+    summary_dict["disease"] = {}
+    summary_dict["cell_type"] = {}
+    summary_dict["tissue"] = {}
+    summary_dict["sex"] = {}
+    for train in tqdm(all_train):
+        with open(os.path.join(os.path.dirname(train), "bio_stats.json")) as f:
+            stats = json.load(f)
+        for cat in ["disease", "cell_type", "tissue", "sex"]:
+            for k in stats[cat].keys():
+                if k not in summary_dict[cat].keys():
+                    summary_dict[cat][k] = stats[cat][k]
+                else:
+                    summary_dict[cat][k] += stats[cat][k]
+    os.makedirs(os.path.dirname(args.stats_dict_name), exist_ok=True)
+    with open(args.stats_dict_name, "w") as f:
+        json.dump(summary_dict, f, indent=4)
+    # with open(args.stats_dict_name) as f:
+    #     summary_dict = json.load(f)
+    for cat in ["disease", "cell_type", "tissue", "sex"]:
+        df = pd.DataFrame.from_dict(summary_dict[cat], orient="index", columns=["Counts"])
+        df.to_csv(args.stats_dict_name.replace(".json", f"_{cat}.csv"))

teddy/data_processing/utils/bio_annotations/data/all_filtered.json ADDED Viewed

	@@ -0,0 +1,1227 @@

+{
+    "disease": {
+        "frontotemporal dementia": 143044,
+        "normal": 97973249,
+        "dementia": 2118803,
+        "autosomal dominant polycystic kidney disease": 415071,
+        "diabetic kidney disease": 360692,
+        "lung adenocarcinoma": 1229577,
+        "small cell lung carcinoma": 194368,
+        "COVID-19": 1704704,
+        "Alzheimer disease": 128732,
+        "renal cell carcinoma": 142070,
+        "dilated cardiomyopathy": 588582,
+        "Crohn disease": 263111,
+        "gastric intestinal metaplasia": 33283,
+        "gastritis": 160485,
+        "Barrett esophagus": 14172,
+        "myocardial infarction": 241855,
+        "epidermolysis bullosa": 7618,
+        "B-cell acute lymphoblastic leukemia": 129641,
+        "anencephaly": 2358,
+        "metastatic melanoma": 218670,
+        "premalignant hematological system disease": 55556,
+        "triple-negative breast carcinoma": 13700,
+        "luminal B breast carcinoma": 10467,
+        "malignant ovarian serous tumor": 649870,
+        "basal cell carcinoma": 62950,
+        "nonpapillary renal cell carcinoma": 147521,
+        "benign prostatic hyperplasia": 41258,
+        "luminal A breast carcinoma": 12402,
+        "systemic lupus erythematosus": 715392,
+        "Lewy body dementia": 26103,
+        "Parkinson disease": 130493,
+        "periodontitis": 32082,
+        "arrhythmogenic right ventricular cardiomyopathy": 52749,
+        "non-compaction cardiomyopathy": 9733,
+        "pulmonary emphysema": 21052,
+        "juvenile dermatomyositis": 92130,
+        "blastoma": 25472,
+        "hydrosalpinx": 1048,
+        "chronic kidney disease": 225534,
+        "type 1 diabetes mellitus": 90792,
+        "acute kidney failure": 128160,
+        "listeriosis": 52720,
+        "Plasmodium malariae malaria": 16864,
+        "pilocytic astrocytoma": 35774,
+        "glioblastoma": 1145013,
+        "Wilms tumor": 3474,
+        "primary sclerosing cholangitis": 27661,
+        "amyotrophic lateral sclerosis": 61201,
+        "aspiration pneumonia": 13718,
+        "malignant pancreatic neoplasm": 4296,
+        "amyotrophic lateral sclerosis 26 with or without frontotemporal dementia": 44240,
+        "acute myeloid leukemia": 31466,
+        "breast carcinoma": 71768,
+        "temporal lobe epilepsy": 2094,
+        "type 2 diabetes mellitus": 143044,
+        "non-small cell lung carcinoma": 524920,
+        "Crohn ileitis": 22076,
+        "respiratory failure": 4176,
+        "long COVID-19": 258,
+        "tubular adenoma": 719,
+        "tubulovillous adenoma": 402,
+        "colorectal cancer": 1155,
+        "multiple sclerosis": 10249,
+        "clear cell renal carcinoma": 96277,
+        "colon sessile serrated adenoma/polyp": 748,
+        "hyperplastic polyp": 101,
+        "epilepsy": 92965,
+        "brain neoplasm": 78,
+        "hydrocephalus": 6,
+        "plasma cell myeloma": 3264,
+        "lymphadenitis": 1373,
+        "primary biliary cholangitis": 31864,
+        "chromophobe renal cell carcinoma": 501,
+        "kidney oncocytoma": 45668,
+        "neuroendocrine carcinoma": 558,
+        "adenocarcinoma": 1033,
+        "influenza": 8650,
+        "endocrine pancreas disorder": 28000,
+        "age related macular degeneration 7": 1052,
+        "basal laminar drusen": 891,
+        "opiate dependence": 50745,
+        "digestive system disorder": 15495,
+        "breast cancer": 4376,
+        "cardiomyopathy": 2570,
+        "respiratory system disorder": 82770,
+        "pulmonary fibrosis": 215970,
+        "chronic obstructive pulmonary disease": 58075,
+        "interstitial lung disease": 57615,
+        "squamous cell lung carcinoma": 20585,
+        "cystic fibrosis": 17940,
+        "lung large cell carcinoma": 17825,
+        "chronic rhinitis": 16790,
+        "lymphangioleiomyomatosis": 11385,
+        "pleomorphic carcinoma": 11155,
+        "pulmonary sarcoidosis": 4600,
+        "hypersensitivity pneumonitis": 3910,
+        "non-specific interstitial pneumonia": 230
+    },
+    "cell_type": {
+        "oligodendrocyte": 4057585,
+        "neuron": 6962719,
+        "astrocyte": 1585731,
+        "oligodendrocyte precursor cell": 742101,
+        "microglial cell": 2726973,
+        "endothelial cell": 2112592,
+        "extravillous trophoblast": 83805,
+        "placental villous trophoblast": 57075,
+        "syncytiotrophoblast cell": 14970,
+        "skin fibroblast": 123651,
+        "T cell": 742776,
+        "enterocyte": 46834,
+        "endothelial cell of lymphatic vessel": 128117,
+        "fibroblast": 1845139,
+        "blood vessel endothelial cell": 169588,
+        "B cell": 1179496,
+        "enteroendocrine cell": 11373,
+        "macrophage": 847475,
+        "dendritic cell": 79130,
+        "vascular leptomeningeal cell": 74718,
+        "retina horizontal cell": 21504,
+        "natural killer cell": 672198,
+        "large pre-B-II cell": 15414,
+        "small pre-B-II cell": 13686,
+        "double negative thymocyte": 60459,
+        "pro-B cell": 47113,
+        "group 3 innate lymphoid cell": 10890,
+        "late pro-B cell": 10121,
+        "fraction A pre-pro B cell": 7828,
+        "B-2 B cell": 2072,
+        "unknown": 1083041,
+        "early lymphoid progenitor": 3226,
+        "double-positive, alpha-beta thymocyte": 70221,
+        "hematopoietic stem cell": 86169,
+        "naive thymus-derived CD4-positive, alpha-beta T cell": 676827,
+        "hematopoietic multipotent progenitor cell": 18797,
+        "B-1 B cell": 394,
+        "naive thymus-derived CD8-positive, alpha-beta T cell": 240314,
+        "megakaryocyte-erythroid progenitor cell": 25072,
+        "regulatory T cell": 176108,
+        "mature B cell": 15042,
+        "group 2 innate lymphoid cell": 287,
+        "innate lymphoid cell": 367929,
+        "immature B cell": 18765,
+        "common myeloid progenitor": 2420,
+        "CD8-alpha-alpha-positive, alpha-beta intraepithelial T cell": 6932,
+        "granulocyte monocyte progenitor cell": 5762,
+        "plasma cell": 298232,
+        "kidney proximal convoluted tubule epithelial cell": 870511,
+        "leukocyte": 139392,
+        "kidney loop of Henle thick ascending limb epithelial cell": 357616,
+        "kidney distal convoluted tubule epithelial cell": 96341,
+        "kidney interstitial fibroblast": 87152,
+        "blood vessel smooth muscle cell": 27558,
+        "kidney collecting duct principal cell": 218533,
+        "kidney collecting duct intercalated cell": 53571,
+        "podocyte": 22019,
+        "mesangial cell": 2969,
+        "kidney granular cell": 3024,
+        "macula densa epithelial cell": 1030,
+        "muscle cell": 35598,
+        "fibroblast of dermis": 54301,
+        "tendon cell": 28946,
+        "Schwann cell": 35927,
+        "chondrocyte": 304322,
+        "smooth muscle cell": 216928,
+        "endothelial cell of artery": 206630,
+        "reticulocyte": 676,
+        "vein endothelial cell": 261893,
+        "pericyte": 452287,
+        "peridermal cell": 834,
+        "basal cell": 438249,
+        "articular chondrocyte": 128,
+        "mesenchymal cell": 434310,
+        "connective tissue cell": 71,
+        "erythrocyte": 284387,
+        "hypertrophic chondrocyte": 146,
+        "megakaryocyte": 75963,
+        "muscle fibroblast": 34365,
+        "mature NK T cell": 138818,
+        "myeloid cell": 329228,
+        "kidney interstitial cell": 12086,
+        "epithelial cell of nephron": 9155,
+        "mesenchymal stem cell": 75348,
+        "epithelial cell of proximal tubule": 272380,
+        "kidney connecting tubule epithelial cell": 24270,
+        "epithelial cell of glomerular capsule": 940,
+        "nephron tubule epithelial cell": 750,
+        "kidney collecting duct cell": 670,
+        "stromal cell of ovary": 49866,
+        "granulosa cell": 32638,
+        "theca cell": 7196,
+        "epithelial cell": 699771,
+        "epithelial cell of alveolus of lung": 3875,
+        "goblet cell": 16960,
+        "ionocyte": 2292,
+        "hepatocyte": 423766,
+        "ciliated epithelial cell": 6479,
+        "neuroendocrine cell": 1145,
+        "club cell": 94729,
+        "brush cell": 1345,
+        "platelet": 38070,
+        "central nervous system macrophage": 170125,
+        "ependymal cell": 97775,
+        "vascular associated smooth muscle cell": 235809,
+        "mesothelial cell": 180169,
+        "neutrophil": 147519,
+        "monocyte": 378010,
+        "stromal cell": 1139185,
+        "cord blood hematopoietic stem cell": 120,
+        "mast cell": 140002,
+        "professional antigen presenting cell": 3636,
+        "erythroid lineage cell": 6784,
+        "primordial germ cell": 2661,
+        "alternatively activated macrophage": 13712,
+        "L2/3-6 intratelencephalic projecting glutamatergic neuron": 4474360,
+        "pvalb GABAergic cortical interneuron": 544593,
+        "chandelier pvalb GABAergic cortical interneuron": 64124,
+        "sst GABAergic cortical interneuron": 478132,
+        "Bergmann glial cell": 52764,
+        "glutamatergic neuron": 10341906,
+        "transit amplifying cell of colon": 1716,
+        "CD8-alpha-beta-positive, alpha-beta intraepithelial T cell": 9,
+        "intestinal crypt stem cell": 13171,
+        "intestinal tuft cell": 136,
+        "enteric smooth muscle cell": 12676,
+        "smooth muscle cell of large intestine": 1194,
+        "interstitial cell of Cajal": 3948,
+        "smooth muscle cell of small intestine": 255,
+        "cardiac valve cell": 89903,
+        "primitive red blood cell": 61602,
+        "neurectodermal cell": 15338,
+        "midbrain dopaminergic neuron": 223991,
+        "paraxial cell": 44784,
+        "mesodermal cell": 6565020,
+        "splanchnic mesodermal cell": 274057,
+        "neuroplacodal cell": 30685,
+        "premigratory neural crest cell": 133537,
+        "notochordal cell": 30276,
+        "hemangioblast": 7952,
+        "spinal cord interneuron": 69801,
+        "endodermal cell": 13635,
+        "surface ectodermal cell": 8005,
+        "gut endothelial cell": 9987,
+        "anterior visceral endoderm cell": 2078,
+        "activated CD4-negative, CD8-negative type I NK T cell": 2035,
+        "parietal epithelial cell": 6234,
+        "kidney loop of Henle epithelial cell": 2578,
+        "kidney loop of Henle thin descending limb epithelial cell": 42271,
+        "malignant cell": 1185459,
+        "exhausted T cell": 21551,
+        "CD4-positive helper T cell": 81404,
+        "CD8-positive, alpha-beta T cell": 820541,
+        "promonocyte": 25130,
+        "granulocyte": 52137,
+        "osteoclast": 4143,
+        "promyelocyte": 5841,
+        "Kupffer cell": 57611,
+        "pre-conventional dendritic cell": 340,
+        "myelocyte": 7951,
+        "plasmacytoid dendritic cell": 65151,
+        "common dendritic progenitor": 2427,
+        "mural cell": 382262,
+        "myofibroblast cell": 121210,
+        "glial cell": 85689,
+        "lymphocyte": 61696,
+        "retinal ganglion cell": 193525,
+        "lamp5 GABAergic cortical interneuron": 374201,
+        "luminal epithelial cell of mammary gland": 232901,
+        "endothelial cell of vascular tree": 296575,
+        "mammary gland epithelial cell": 73026,
+        "adipocyte of breast": 4072,
+        "IgA plasma cell": 33332,
+        "class switched memory B cell": 13393,
+        "naive B cell": 264747,
+        "IgG plasma cell": 7559,
+        "unswitched memory B cell": 8330,
+        "centrilobular region hepatocyte": 43795,
+        "periportal region hepatocyte": 64925,
+        "blood cell": 1735,
+        "tracheal epithelial cell": 91061,
+        "medium spiny neuron": 94404,
+        "inhibitory interneuron": 161476,
+        "cell": 2298,
+        "uterine smooth muscle cell": 10162,
+        "decidual natural killer cell, human": 9147,
+        "endothelial cell of uterus": 5504,
+        "trophoblast giant cell": 54,
+        "embryonic fibroblast": 207,
+        "cardiac endothelial cell": 47096,
+        "fibroblast of cardiac tissue": 328488,
+        "immature innate lymphoid cell": 52909,
+        "cardiac muscle myoblast": 20061,
+        "lymphoid lineage restricted progenitor cell": 17592,
+        "smooth muscle myoblast": 3485,
+        "neuronal receptor cell": 1736,
+        "fibroblast of lymphatic vessel": 425,
+        "flat midget bipolar cell": 452566,
+        "classical monocyte": 798528,
+        "conventional dendritic cell": 85061,
+        "CD14-positive monocyte": 492097,
+        "effector memory CD8-positive, alpha-beta T cell": 228419,
+        "CD14-positive, CD16-positive monocyte": 3762,
+        "central memory CD4-positive, alpha-beta T cell": 664927,
+        "CD56-positive, CD161-positive immature natural killer cell, human": 910,
+        "CD16-positive, CD56-dim natural killer cell, human": 242315,
+        "CD8-positive, alpha-beta cytotoxic T cell": 48837,
+        "supporting cell": 25177,
+        "interstitial cell of ovary": 15608,
+        "hematopoietic cell": 59088,
+        "neural cell": 6988055,
+        "germ cell": 5414,
+        "ovarian surface epithelial cell": 3128,
+        "L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": 327769,
+        "L6 corticothalamic-projecting glutamatergic cortical neuron": 146758,
+        "vip GABAergic cortical interneuron": 584164,
+        "L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": 81460,
+        "hippocampal neuron": 64655,
+        "L6b glutamatergic cortical neuron": 137215,
+        "L5/6 near-projecting glutamatergic neuron of the primary motor cortex": 38791,
+        "L5 extratelencephalic projecting glutamatergic cortical neuron": 47098,
+        "pyramidal neuron": 25116,
+        "sncg GABAergic cortical interneuron": 159382,
+        "corticothalamic-projecting glutamatergic cortical neuron": 160820,
+        "L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": 12699,
+        "sst chodl GABAergic cortical interneuron": 2506,
+        "cortical interneuron": 17508,
+        "vascular leptomeningeal cell (Mmus)": 84,
+        "meis2 expressing cortical GABAergic cell": 72,
+        "Cajal-Retzius cell": 52222,
+        "fibroblast of lung": 41525,
+        "type I pneumocyte": 84264,
+        "type II pneumocyte": 317250,
+        "gut absorptive cell": 2017,
+        "progenitor cell": 102352,
+        "intestinal crypt stem cell of large intestine": 1020,
+        "transit amplifying cell of small intestine": 1050,
+        "intestinal crypt stem cell of small intestine": 864,
+        "secretory cell": 93111,
+        "intestine goblet cell": 5700,
+        "enterocyte of epithelium of large intestine": 21896,
+        "paneth cell of epithelium of small intestine": 61,
+        "intestinal enteroendocrine cell": 560,
+        "duodenum glandular cell": 2,
+        "large intestine goblet cell": 7884,
+        "T follicular helper cell": 25073,
+        "GABAergic neuron": 2473147,
+        "fibroblast of mammary gland": 2345758,
+        "perivascular cell": 202406,
+        "luminal adaptive secretory precursor cell of mammary gland": 370605,
+        "endothelial tip cell": 33694,
+        "CD8-positive, alpha-beta memory T cell": 127077,
+        "luminal hormone-sensing cell of mammary gland": 246454,
+        "myoepithelial cell of mammary gland": 64397,
+        "capillary endothelial cell": 221145,
+        "brain vascular cell": 128078,
+        "dopaminergic neuron": 45014,
+        "serotonergic neuron": 3977,
+        "cerebellar neuron": 259346,
+        "neural progenitor cell": 323569,
+        "CD4-positive, alpha-beta T cell": 1055045,
+        "glycinergic amacrine cell": 462394,
+        "starburst amacrine cell": 51890,
+        "retinal rod cell": 1321083,
+        "Mueller cell": 264909,
+        "rod bipolar cell": 157105,
+        "ON-bipolar cell": 52848,
+        "OFF-bipolar cell": 43664,
+        "retinal cone cell": 87116,
+        "amacrine cell": 290117,
+        "melanocyte": 22531,
+        "retinal pigment epithelial cell": 7979,
+        "adipocyte": 145654,
+        "fibro/adipogenic progenitor cell": 15532,
+        "neuron associated cell": 888,
+        "inhibitory motor neuron": 483,
+        "motor neuron": 484,
+        "precursor B cell": 56201,
+        "interneuron": 34115,
+        "fallopian tube secretory epithelial cell": 105472,
+        "suprabasal keratinocyte": 22807,
+        "basal cell of epidermis": 83046,
+        "proerythroblast": 12294,
+        "kidney loop of Henle ascending limb epithelial cell": 8025,
+        "collagen secreting cell": 12335,
+        "epithelial cell of proximal tubule segment 1": 5885,
+        "MHC-II-positive classical monocyte": 77,
+        "naive T cell": 16638,
+        "chondroblast": 25460,
+        "osteoblast": 13763,
+        "myoblast": 221886,
+        "skeletal muscle myoblast": 12575,
+        "Schwann cell precursor": 459201,
+        "keratinocyte": 831988,
+        "inflammatory macrophage": 11854,
+        "monocyte-derived dendritic cell": 30,
+        "Langerhans cell": 730,
+        "cytotoxic T cell": 4099,
+        "forebrain neuroblast": 19384,
+        "chandelier cell": 898,
+        "caudal ganglionic eminence derived GABAergic cortical interneuron": 45259,
+        "basal cell of prostate epithelium": 100016,
+        "epithelial cell of urethra": 10903,
+        "luminal cell of prostate epithelium": 60854,
+        "prostate gland microvascular endothelial cell": 8454,
+        "prostate stromal cell": 1040,
+        "smooth muscle cell of prostate": 12018,
+        "lymphocyte of B lineage": 604,
+        "smooth muscle cell of the pulmonary artery": 5782,
+        "acinar cell of salivary gland": 97013,
+        "memory B cell": 116175,
+        "adventitial cell": 35481,
+        "duct epithelial cell": 5535,
+        "endothelial cell of hepatic sinusoid": 55050,
+        "non-classical monocyte": 134769,
+        "plasmablast": 20107,
+        "glomerular endothelial cell": 31920,
+        "renal intercalated cell": 428,
+        "vasa recta ascending limb cell": 670,
+        "vasa recta descending limb cell": 318,
+        "kidney epithelial cell": 8247,
+        "renal beta-intercalated cell": 3596,
+        "renal alpha-intercalated cell": 5747,
+        "urothelial cell": 45,
+        "renal principal cell": 2709,
+        "cell of skeletal muscle": 986689,
+        "thymocyte": 25853,
+        "pro-T cell": 5208,
+        "hematopoietic precursor cell": 10899,
+        "stem cell": 26362,
+        "paneth cell": 1204,
+        "type L enteroendocrine cell": 538,
+        "type EC enteroendocrine cell": 1642,
+        "hepatic stellate cell": 6302,
+        "cholangiocyte": 4645,
+        "endothelial cell of periportal hepatic sinusoid": 2930,
+        "endothelial cell of pericentral hepatic sinusoid": 7562,
+        "alveolar macrophage": 428496,
+        "effector memory CD4-positive, alpha-beta T cell": 86897,
+        "myeloid leukocyte": 13316,
+        "CD1c-positive myeloid dendritic cell": 90776,
+        "myeloid dendritic cell, human": 204,
+        "stratified epithelial cell": 26520,
+        "epithelial cell of stratum germinativum of esophagus": 405,
+        "mononuclear phagocyte": 41624,
+        "mucus secreting cell": 1815,
+        "regular atrial cardiac myocyte": 137963,
+        "Tc1 cell": 4156,
+        "endothelial cell of placenta": 21094,
+        "Hofbauer cell": 40343,
+        "group 3 innate lymphoid cell, human": 187,
+        "kidney collecting duct epithelial cell": 707,
+        "fenestrated cell": 3592,
+        "early T lineage precursor": 672,
+        "CD4-positive, alpha-beta memory T cell": 17487,
+        "erythroid progenitor cell": 776619,
+        "central memory CD8-positive, alpha-beta T cell": 26584,
+        "gamma-delta T cell": 113818,
+        "early promyelocyte": 4281,
+        "CD16-negative, CD56-bright natural killer cell, human": 37098,
+        "megakaryocyte progenitor cell": 423,
+        "late promyelocyte": 1039,
+        "basophil mast progenitor cell": 275,
+        "CD4-positive, alpha-beta cytotoxic T cell": 18248,
+        "airway submucosal gland duct basal cell": 4957,
+        "serous secreting cell of bronchus submucosal gland": 16876,
+        "ciliated cell": 32399,
+        "lung secretory cell": 20594,
+        "myoepithelial cell": 2506,
+        "lung macrophage": 15050,
+        "mesenchymal stem cell of adipose tissue": 31777,
+        "regular ventricular cardiac myocyte": 235052,
+        "choroid plexus epithelial cell": 77830,
+        "aortic endothelial cell": 969,
+        "fibrocyte": 386,
+        "kidney loop of Henle thin ascending limb epithelial cell": 28383,
+        "kidney interstitial alternatively activated macrophage": 6719,
+        "renal interstitial pericyte": 11322,
+        "papillary tips cell": 714,
+        "fast muscle cell": 2253,
+        "skeletal muscle fiber": 1683,
+        "slow muscle cell": 1564,
+        "skeletal muscle satellite cell": 439526,
+        "retinal blood vessel endothelial cell": 980,
+        "non-myelinating Schwann cell": 587,
+        "lung perichondrial fibroblast": 1769,
+        "respiratory suprabasal cell": 4061,
+        "lung pericyte": 14470,
+        "memory T cell": 19260,
+        "leptomeningeal cell": 3653,
+        "Sertoli cell": 1349,
+        "macroglial cell": 38258,
+        "retinal bipolar neuron": 137284,
+        "cerebellar granule cell": 763044,
+        "intermediate monocyte": 6477,
+        "erythroblast": 78556,
+        "midzonal region hepatocyte": 7649,
+        "endothelial cell of venule": 17454,
+        "helper T cell": 25512,
+        "mucosal invariant T cell": 82156,
+        "T-helper 17 cell": 465,
+        "olfactory epithelial cell": 150663,
+        "auditory epithelial cell": 131901,
+        "endo-epithelial cell": 167871,
+        "epithelial cell of amnion": 80603,
+        "intermediate mesodermal cell": 88574,
+        "ectodermal cell": 50893,
+        "metanephric mesenchyme stem cell": 7479,
+        "ureteric bud cell": 5049,
+        "pituitary gland cell": 10257,
+        "pancreatic acinar cell": 45689,
+        "lens epithelial cell": 4935,
+        "epithelial cell of parathyroid gland": 540,
+        "epithelial cell of thymus": 651,
+        "intrahepatic cholangiocyte": 4572,
+        "epithelial cell of thyroid gland": 1057,
+        "peripheral nervous system neuron": 185140,
+        "neural crest cell": 1710,
+        "sensory neuron": 423,
+        "cerebral cortex endothelial cell": 42882,
+        "microvascular endothelial cell": 7776,
+        "brain pericyte": 5548,
+        "endocardial cell": 69205,
+        "adipocyte of epicardial fat of left ventricle": 732,
+        "CD14-low, CD16-positive monocyte": 82303,
+        "DN4 thymocyte": 5218,
+        "pancreatic stellate cell": 27724,
+        "pancreatic ductal cell": 163344,
+        "type B pancreatic cell": 186236,
+        "CD8-positive, alpha-beta memory T cell, CD45RO-positive": 15518,
+        "alpha-beta T cell": 28679,
+        "effector memory CD8-positive, alpha-beta T cell, terminally differentiated": 6391,
+        "brown preadipocyte": 37252,
+        "brown adipocyte": 73109,
+        "lung ciliated cell": 2749,
+        "effector CD8-positive, alpha-beta T cell": 120129,
+        "T-helper 22 cell": 29502,
+        "myeloid dendritic cell": 8631,
+        "dendritic cell, human": 5529,
+        "erythroid progenitor cell, mammalian": 922,
+        "ILC1, human": 825,
+        "CD34-positive, CD38-negative hematopoietic stem cell": 561,
+        "IgM plasma cell": 1012,
+        "T-helper 1 cell": 99,
+        "group 2 innate lymphoid cell, human": 66,
+        "myeloid lineage restricted progenitor cell": 2530,
+        "T-helper 2 cell": 35,
+        "astrocyte of the cerebral cortex": 277552,
+        "near-projecting glutamatergic cortical neuron": 112806,
+        "effector CD4-positive, alpha-beta T cell": 99747,
+        "type I NK T cell": 44099,
+        "CD141-positive myeloid dendritic cell": 4243,
+        "mature conventional dendritic cell": 228,
+        "melanocyte of skin": 2528,
+        "pancreatic A cell": 85447,
+        "pancreatic D cell": 33374,
+        "pancreatic PP cell": 7920,
+        "CD14-positive, CD16-negative classical monocyte": 42474,
+        "CD4-positive, CD25-positive, alpha-beta regulatory T cell": 1167,
+        "kidney connecting tubule principal cell": 317,
+        "epithelial cell of large intestine": 8616,
+        "Purkinje cell": 270713,
+        "granule cell": 63342,
+        "neuron associated cell (sensu Vertebrata)": 36057,
+        "stellate neuron": 29019,
+        "neuronal brush cell": 13107,
+        "myotube": 34568,
+        "muscle precursor cell": 65050,
+        "transitional stage B cell": 50088,
+        "immature neutrophil": 16,
+        "medial ganglionic eminence derived interneuron": 1431,
+        "caudal ganglionic eminence derived interneuron": 852,
+        "bronchus fibroblast of lung": 11918,
+        "pigmented epithelial cell": 21792,
+        "smooth muscle cell of sphincter of pupil": 1107,
+        "IgG plasmablast": 599,
+        "IgA plasmablast": 413,
+        "plasmatocyte": 1663,
+        "kidney cortex artery cell": 1199,
+        "kidney capillary endothelial cell": 168,
+        "kidney proximal straight tubule epithelial cell": 13294,
+        "cardiac muscle cell": 765261,
+        "mesothelial cell of epicardium": 2111,
+        "fetal cardiomyocyte": 128,
+        "cardiac mesenchymal cell": 4,
+        "pneumocyte": 3852,
+        "mononuclear cell": 922,
+        "tonsil germinal center B cell": 700,
+        "centroblast": 110,
+        "centrocyte": 56,
+        "macrophage dendritic cell progenitor": 218,
+        "immature NK T cell": 3517,
+        "neuroblast (sensu Vertebrata)": 2062822,
+        "alveolar type 2 fibroblast cell": 33366,
+        "tracheobronchial smooth muscle cell": 18892,
+        "lung goblet cell": 724,
+        "respiratory basal cell": 132261,
+        "brush cell of trachebronchial tree": 912,
+        "mesothelial fibroblast": 78,
+        "bladder urothelial cell": 7613,
+        "bladder cell": 5132,
+        "neoplastic cell": 25559,
+        "endothelial cell of coronary artery": 8166,
+        "cardiac neuron": 11728,
+        "OFF retinal ganglion cell": 896,
+        "ON retinal ganglion cell": 482,
+        "lung resident memory CD8-positive, alpha-beta T cell": 6137,
+        "lung resident memory CD4-positive, alpha-beta T cell": 2732,
+        "deuterosomal cell": 266,
+        "granulocytopoietic cell": 16454,
+        "basophil": 816,
+        "PP cell": 408,
+        "pancreatic epsilon cell": 95,
+        "fibroblast of connective tissue of prostate": 7852,
+        "double negative T regulatory cell": 265,
+        "progenitor cell of mammary luminal epithelium": 8436,
+        "lactocyte": 5949,
+        "vascular lymphangioblast": 4051,
+        "lung endothelial cell": 35063,
+        "respiratory goblet cell": 1842,
+        "cardiac pacemaker cell of sinoatrial node": 792,
+        "activated CD4-positive, alpha-beta T cell": 3104,
+        "differentiation-committed oligodendrocyte precursor": 3019,
+        "glycinergic neuron": 93491,
+        "keratinocyte stem cell": 5652,
+        "bronchial smooth muscle cell": 11900,
+        "epidermal cell": 2044,
+        "basal epithelial cell of tracheobronchial tree": 927,
+        "neural stem cell": 272,
+        "mature alpha-beta T cell": 54887,
+        "brush cell of epithelium proper of large intestine": 313,
+        "smooth muscle cell of trachea": 337,
+        "ciliated columnar cell of tracheobronchial tree": 71880,
+        "early pro-B cell": 32954,
+        "pulmonary interstitial fibroblast": 176,
+        "neuroepithelial stem cell": 163,
+        "lung neuroendocrine cell": 627,
+        "common lymphoid progenitor": 7871,
+        "plasmacytoid dendritic cell, human": 3061,
+        "activated CD4-positive, alpha-beta T cell, human": 1153,
+        "lateral mesodermal cell": 2491777,
+        "hypothalamus cell": 342662,
+        "primitive erythroid progenitor": 427024,
+        "retinal progenitor cell": 340536,
+        "spinal cord motor neuron": 152116,
+        "cranial motor neuron": 98269,
+        "enteric neuron": 37246,
+        "spiral ganglion neuron": 27856,
+        "cerebral cortex GABAergic interneuron": 565934,
+        "embryonic blood vessel endothelial progenitor cell": 10477,
+        "sympathetic neuron": 55650,
+        "olfactory receptor cell": 10237,
+        "extraembryonic cell": 717,
+        "fibroblast of breast": 5733,
+        "endothelial cell of umbilical vein": 56790,
+        "transit amplifying cell": 3564,
+        "M cell of gut": 52,
+        "hypendymal cell": 2379,
+        "oogonial cell": 4905,
+        "female germ cell": 1990,
+        "male germ cell": 1271,
+        "oocyte": 316,
+        "basket cell": 20982,
+        "epithelial cell of prostate": 15037,
+        "basal epithelial cell of prostatic duct": 12694,
+        "contractile cell": 726,
+        "mature T cell": 111587,
+        "eosinophil": 150,
+        "corneal epithelial cell": 15616,
+        "corneal endothelial cell": 4434,
+        "activated CD8-positive, alpha-beta T cell": 15428,
+        "follicular B cell": 3924,
+        "colon macrophage": 9,
+        "myelinating Schwann cell": 43810,
+        "cell in vitro": 23880,
+        "S cone cell": 813,
+        "lung interstitial macrophage": 152,
+        "Leydig cell": 549,
+        "L2/3 intratelencephalic projecting glutamatergic neuron": 92223,
+        "enterocyte of colon": 12280,
+        "mesenchymal lymphangioblast": 2134,
+        "colon epithelial cell": 3672,
+        "CD34-positive, CD56-positive, CD117-positive common innate lymphoid precursor, human": 1012,
+        "NKp44-positive group 3 innate lymphoid cell, human": 748,
+        "NKp44-negative group 3 innate lymphoid cell, human": 374,
+        "primary sensory neuron (sensu Teleostei)": 22,
+        "type N enteroendocrine cell": 22,
+        "progenitor cell of endocrine pancreas": 22,
+        "CD4-positive, alpha-beta thymocyte": 10413,
+        "fibroblast of connective tissue of nonglandular part of prostate": 3872,
+        "fibroblast of connective tissue of glandular part of prostate": 1609,
+        "CD8-positive, alpha-beta thymocyte": 4748,
+        "enucleate erythrocyte": 802,
+        "lung microvascular endothelial cell": 118,
+        "serous cell of epithelium of bronchus": 8,
+        "pulmonary ionocyte": 529,
+        "epithelial cell of pancreas": 1184,
+        "cultured cell": 183749,
+        "reticular cell": 1742,
+        "inflammatory cell": 1504,
+        "stem cell of epidermis": 1247,
+        "pigmented ciliary epithelial cell": 8987,
+        "non-pigmented ciliary epithelial cell": 2750,
+        "ciliary muscle cell": 7528,
+        "acinar cell": 326656,
+        "endocrine cell": 116526,
+        "non-terminally differentiated cell": 4,
+        "pre-natural killer cell": 4,
+        "midget ganglion cell of retina": 79066,
+        "GABAergic amacrine cell": 328894,
+        "diffuse bipolar 3b cell": 15386,
+        "diffuse bipolar 2 cell": 38767,
+        "ON parasol ganglion cell": 2100,
+        "diffuse bipolar 1 cell": 20617,
+        "invaginating midget bipolar cell": 22451,
+        "diffuse bipolar 3a cell": 17037,
+        "H2 horizontal cell": 6438,
+        "OFFx cell": 8003,
+        "H1 horizontal cell": 25682,
+        "diffuse bipolar 4 cell": 17039,
+        "diffuse bipolar 6 cell": 6525,
+        "OFF parasol ganglion cell": 324,
+        "hepatic pit cell": 6559,
+        "follicular dendritic cell": 2,
+        "mature gamma-delta T cell": 1406,
+        "thalamic excitatory neuron": 76744,
+        "small bistratified retinal ganglion cell": 2226,
+        "mature microglial cell": 11466,
+        "intestinal epithelial cell": 29221,
+        "epithelial cell of lung": 29856,
+        "CD38-negative naive B cell": 4952,
+        "urethra urothelial cell": 29251,
+        "seminal vesicle glandular cell": 5439,
+        "type I cell of adrenal cortex": 3043,
+        "germinal center B cell": 217,
+        "kidney cell": 4410,
+        "kidney loop of Henle medullary thick ascending limb epithelial cell": 4792,
+        "kidney loop of Henle cortical thick ascending limb epithelial cell": 2800,
+        "kidney cortex tubule cell": 984,
+        "kidney glomerular epithelial cell": 192,
+        "preadipocyte": 128328,
+        "type 6 cone bipolar cell (sensu Mus)": 24353,
+        "type 5a cone bipolar cell": 20325,
+        "type 7 cone bipolar cell (sensu Mus)": 16313,
+        "type 3b cone bipolar cell": 12627,
+        "type 3a cone bipolar cell": 11349,
+        "type 5b cone bipolar cell": 9221,
+        "type 5 cone bipolar cell (sensu Mus)": 11677,
+        "type 8 cone bipolar cell (sensu Mus)": 3790,
+        "type 9 cone bipolar cell (sensu Mus)": 3519,
+        "type 2 cone bipolar cell (sensu Mus)": 899,
+        "type 4 cone bipolar cell (sensu Mus)": 759,
+        "type 1 cone bipolar cell (sensu Mus)": 3221,
+        "cerebellar granule cell precursor": 18256,
+        "unipolar brush cell": 3600,
+        "glioblast": 2686,
+        "immature astrocyte": 4156,
+        "meningeal macrophage": 1348,
+        "noradrenergic cell": 6,
+        "multi-ciliated epithelial cell": 40320,
+        "pulmonary artery endothelial cell": 18256,
+        "cone retinal bipolar cell": 884,
+        "retinal astrocyte": 70,
+        "efferent neuron": 711,
+        "enterocyte of epithelium proper of ileum": 474,
+        "ileal goblet cell": 68,
+        "smooth muscle fiber of ileum": 52,
+        "enteroendocrine cell of small intestine": 53,
+        "aortic smooth muscle cell": 5680,
+        "mesothelial cell of visceral pleura": 336,
+        "ciliated cell of the bronchus": 19278,
+        "squamous epithelial cell": 2828,
+        "nasal mucosa goblet cell": 54749,
+        "memory regulatory T cell": 45,
+        "naive regulatory T cell": 1512,
+        "myeloid suppressor cell": 44926,
+        "adipose macrophage": 5054,
+        "absorptive cell": 110,
+        "intestinal crypt stem cell of colon": 53,
+        "mature astrocyte": 14537,
+        "hair follicular keratinocyte": 14465,
+        "sebum secreting cell": 369,
+        "granular cell of epidermis": 444,
+        "anterior lens cell": 4920,
+        "secondary lens fiber": 1493,
+        "lens fiber cell": 731,
+        "A2 amacrine cell": 759,
+        "sperm": 10,
+        "abnormal cell": 11002,
+        "myometrial cell": 144,
+        "epithelial cell of uterus": 87,
+        "prickle cell": 29060,
+        "Merkel cell": 964,
+        "cortical thymic epithelial cell": 8923,
+        "medullary thymic epithelial cell": 684,
+        "epicardial adipocyte": 3447,
+        "peritubular capillary endothelial cell": 3,
+        "conjunctival epithelial cell": 1933,
+        "glomerular capillary endothelial cell": 218,
+        "columnar/cuboidal epithelial cell": 4,
+        "kidney resident macrophage": 40,
+        "ON-blue cone bipolar cell": 1599,
+        "CD8-alpha alpha positive, gamma-delta intraepithelial T cell": 1480,
+        "NKp46-positive innate lymphoid cell, human": 12750,
+        "neutrophil progenitor cell": 16,
+        "skeletal muscle satellite stem cell": 1384,
+        "mucosal type mast cell": 294,
+        "metallothionein-positive alveolar macrophage": 46,
+        "cerebral cortex neuron": 6961,
+        "basal cell of epithelium of trachea": 95,
+        "tracheal goblet cell": 336,
+        "photoreceptor cell": 2970,
+        "cochlea auditory hair cell": 430,
+        "pinealocyte": 340,
+        "iris pigment epithelial cell": 310,
+        "radial glial cell": 1331,
+        "GABAergic interneuron": 4622,
+        "pancreatic endocrine cell": 11527,
+        "endothelial cell of sinusoid": 212,
+        "DN3 thymocyte": 3597,
+        "DN1 thymic pro-T cell": 1622,
+        "parasol ganglion cell of retina": 9659,
+        "epithelial cell of proximal tubule segment 3": 3962,
+        "valve interstitial cell": 174,
+        "valve endothelial cell": 150,
+        "myocyte of sinoatrial node": 63,
+        "colon goblet cell": 226,
+        "enteroendocrine cell of colon": 54,
+        "paneth cell of colon": 221,
+        "cholinergic neuron": 13860,
+        "L4/5 intratelencephalic projecting glutamatergic neuron": 6704,
+        "L6 intratelencephalic projecting glutamatergic neuron": 2232,
+        "L3 intratelencephalic projecting glutamatergic neuron": 862,
+        "tanycyte": 2560,
+        "IgG-negative class switched memory B cell": 3171,
+        "IgG memory B cell": 1557,
+        "indirect pathway medium spiny neuron": 5000,
+        "direct pathway medium spiny neuron": 4055,
+        "elicited macrophage": 68370,
+        "alveolar type 1 fibroblast cell": 35555,
+        "respiratory hillock cell": 7550,
+        "epithelial cell of lower respiratory tract": 10120,
+        "serous secreting cell": 2615,
+        "tracheobronchial serous cell": 3820,
+        "tracheobronchial goblet cell": 1650,
+        "bronchial goblet cell": 1473,
+        "epithelial fate stem cell": 200,
+        "lymphatic endothelial cell of medulla ceiling": 364,
+        "lymphatic endothelial cell of subcapsular sinus floor": 283,
+        "lymphatic endothelial cell of subcapsular sinus ceiling": 194,
+        "lymph node lymphatic vessel endothelial cell": 52,
+        "tissue-resident macrophage": 3616,
+        "glandular epithelial cell": 10369,
+        "L4 intratelencephalic projecting glutamatergic neuron": 3992,
+        "L5/6 near-projecting glutamatergic neuron": 838,
+        "forebrain radial glial cell": 17848,
+        "white adipocyte": 6056,
+        "precursor cell": 145,
+        "primary cultured cell": 29,
+        "liver dendritic cell": 642,
+        "giant bipolar cell": 6497,
+        "eurydendroid cell": 208,
+        "type A enteroendocrine cell": 841,
+        "type D enteroendocrine cell": 69,
+        "serous cell of epithelium of trachea": 20,
+        "T follicular regulatory cell": 32,
+        "enterocyte of epithelium of small intestine": 336,
+        "tuft cell of colon": 199,
+        "small intestine goblet cell": 149,
+        "epithelial cell of small intestine": 60,
+        "BEST4+ intestinal epithelial cell, human": 18,
+        "microfold cell of epithelium of small intestine": 11,
+        "foveolar cell of stomach": 98557,
+        "mucous neck cell": 30510,
+        "type G enteroendocrine cell": 9971,
+        "natural T-regulatory cell": 9175,
+        "peptic cell": 2821,
+        "P/D1 enteroendocrine cell": 2307,
+        "parietal cell": 686,
+        "eye photoreceptor cell": 318,
+        "keratocyte": 242,
+        "preosteoblast": 240,
+        "endosteal cell": 110,
+        "immature natural killer cell": 5,
+        "basal cell of epithelium of bronchus": 19220,
+        "brush cell of bronchus": 308,
+        "sensory neuron of dorsal root ganglion": 31650,
+        "parasympathetic neuron": 28182,
+        "immature T cell": 1318,
+        "epithelial cell of esophagus": 24915,
+        "glandular cell of esophagus": 990,
+        "perineuronal satellite cell": 7154,
+        "olfactory ensheathing cell": 1946
+    },
+    "tissue": {
+        "occipital cortex": 162963,
+        "trophoblast": 30396,
+        "dermis": 25135,
+        "skin of body": 166875,
+        "small intestine": 46378,
+        "dorsolateral prefrontal cortex": 3412403,
+        "fovea centralis": 487732,
+        "peripheral region of retina": 2732577,
+        "bone marrow": 488335,
+        "liver": 767833,
+        "thymus": 257017,
+        "kidney": 1923863,
+        "hindlimb": 124567,
+        "cerebral cortex": 3652821,
+        "lung": 3548542,
+        "lymph node": 688503,
+        "axilla": 14419,
+        "brain": 2497814,
+        "adrenal gland": 14065,
+        "bone spine": 3772,
+        "pleural effusion": 56289,
+        "cerebellum lobule": 52243,
+        "blood": 6877569,
+        "ovary": 80987,
+        "cerebellum": 2415752,
+        "thalamic complex": 808199,
+        "pleura": 577320,
+        "primary motor cortex": 1383100,
+        "myelencephalon": 182804,
+        "pons": 528502,
+        "midbrain": 1080027,
+        "cerebral nuclei": 704599,
+        "hypothalamus": 617214,
+        "cortex of kidney": 743525,
+        "entorhinal cortex": 271265,
+        "colonic epithelium": 12167,
+        "colon": 176975,
+        "ileum": 313492,
+        "hindgut": 3942,
+        "embryo": 26758932,
+        "renal medulla": 468214,
+        "heart left ventricle": 908218,
+        "heart right ventricle": 1008903,
+        "middle temporal gyrus": 3335710,
+        "lamina propria of mucosa of colon": 8626,
+        "sigmoid colon": 7269,
+        "retina": 653872,
+        "body of stomach": 225939,
+        "submucosal esophageal gland": 6716,
+        "cardia of stomach": 27350,
+        "lower esophagus": 30615,
+        "esophagogastric junction": 13434,
+        "duodenum": 12222,
+        "breast": 5177404,
+        "caudate lobe of liver": 222664,
+        "inguinal fat pad": 53002,
+        "epididymal fat pad": 161432,
+        "periovarian fat pad": 24905,
+        "hippocampal formation": 1261479,
+        "spinal cord": 90537,
+        "tracheal epithelial cell": 91061,
+        "cortical layer VI": 141377,
+        "corpus callosum": 138982,
+        "cortical layer V": 101073,
+        "cortical layer II/III": 88877,
+        "striatum": 1276789,
+        "pia mater": 25346,
+        "olfactory region": 7353,
+        "brain ventricle": 3306,
+        "decidua basalis": 32543,
+        "spleen": 401655,
+        "prefrontal cortex": 2048128,
+        "temporal lobe": 1019468,
+        "primary somatosensory cortex": 701183,
+        "parietal cortex": 63124,
+        "secondary visual cortex": 29245,
+        "primary auditory cortex": 199760,
+        "autopod skin": 13687,
+        "macula lutea": 358464,
+        "macula lutea proper": 105599,
+        "gonad": 172997,
+        "primary visual cortex": 1374285,
+        "anterolateral visual area": 10288,
+        "anterior cingulate cortex": 400317,
+        "visual cortex": 193642,
+        "subicular complex": 55770,
+        "secondary somatosensory cortex": 2116,
+        "posterior parietal association areas": 28569,
+        "temporal cortex": 59645,
+        "agranular insular cortex": 54701,
+        "retrosplenial region": 1604,
+        "gustatory cortex": 1580,
+        "lateral entorhinal cortex": 1572,
+        "medial entorhinal cortex": 1112,
+        "medial orbital frontal cortex": 110134,
+        "auditory cortex": 73036,
+        "claustrum of brain": 4,
+        "amygdala": 142045,
+        "lateral amygdaloid nucleus": 18643,
+        "medial amygdaloid nucleus": 17295,
+        "apex of heart": 83168,
+        "interventricular septum": 392607,
+        "right cardiac atrium": 114445,
+        "left cardiac atrium": 380823,
+        "diencephalon": 238693,
+        "pigment epithelium of eye": 956,
+        "subcutaneous abdominal adipose tissue": 41136,
+        "visceral abdominal adipose tissue": 35796,
+        "posterior hypothalamic region": 74913,
+        "intestine": 25127,
+        "adnexa of uterus": 280223,
+        "nose skin": 71320,
+        "Brodmann (1909) area 25": 36225,
+        "ileal epithelium": 8118,
+        "forelimb": 500298,
+        "head of caudate nucleus": 28424,
+        "skin of forehead": 15247,
+        "ganglionic eminence": 8108,
+        "respiratory airway": 467047,
+        "transition zone of prostate": 88242,
+        "peripheral zone of prostate": 54097,
+        "parotid gland": 178118,
+        "upper lobe of left lung": 27792,
+        "substantia nigra pars compacta": 314973,
+        "renal pelvis": 95,
+        "kidney blood vessel": 59,
+        "epithelium of esophagus": 39735,
+        "sinoatrial node": 66843,
+        "rectum": 1208,
+        "decidua": 185295,
+        "inferior temporal gyrus": 99318,
+        "bronchus": 47507,
+        "mesenteric fat pad": 5461,
+        "atrioventricular node": 729,
+        "aorta": 12331,
+        "renal papilla": 168923,
+        "tendon of semitendinosus": 10413,
+        "adipose tissue": 80482,
+        "exocrine pancreas": 20609,
+        "myometrium": 3014,
+        "cardiac atrium": 7745,
+        "subcutaneous adipose tissue": 121403,
+        "mammary gland": 28594,
+        "skin of chest": 4486,
+        "muscle of abdomen": 1610,
+        "sublingual gland": 418,
+        "muscle of pelvic diaphragm": 14648,
+        "sclera": 372,
+        "cardiac ventricle": 1141,
+        "skin of abdomen": 7387,
+        "rectus abdominis muscle": 300,
+        "trachea": 51774,
+        "prostate gland": 116873,
+        "coronary artery": 278,
+        "posterior part of tongue": 242,
+        "cornea": 23298,
+        "endometrium": 117357,
+        "bladder organ": 11030,
+        "anterior part of tongue": 4346,
+        "vasculature": 8863,
+        "uterus": 34,
+        "large intestine": 64353,
+        "lacrimal gland": 4,
+        "inguinal lymph node": 20664,
+        "gingiva": 86814,
+        "inguinal part of abdomen": 8437,
+        "neural tube": 10299,
+        "glabella skin": 4007,
+        "neocortex": 6313087,
+        "white matter": 1111027,
+        "hindbrain": 567702,
+        "pallidum": 121800,
+        "cortical subplate": 342549,
+        "ventricular system of brain": 167874,
+        "alveolus of lung": 39661,
+        "thoracic lymph node": 46753,
+        "jejunal epithelium": 32845,
+        "lamina propria": 36972,
+        "brown preadipocyte": 7005,
+        "brain white matter": 20212,
+        "brain gray matter": 6670,
+        "caudal ganglionic eminence": 49470,
+        "medial ganglionic eminence": 28614,
+        "parietal lobe": 20622,
+        "orbitofrontal cortex": 6972,
+        "skin of temple": 61595,
+        "skin of cheek": 47600,
+        "islet of Langerhans": 439308,
+        "cervical spinal cord white matter": 35591,
+        "white matter of cerebellum": 21181,
+        "Brodmann (1909) area 4": 14097,
+        "iris": 48066,
+        "ascitic fluid": 93627,
+        "omentum": 152695,
+        "peritoneum": 55902,
+        "abdomen": 18036,
+        "right ovary": 17910,
+        "lung parenchyma": 645194,
+        "tonsil": 25382,
+        "superior frontal gyrus": 71605,
+        "bladder lumen": 12866,
+        "heart": 51664,
+        "fallopian tube": 47676,
+        "lower lobe of left lung": 63958,
+        "urethra": 77362,
+        "placenta": 230912,
+        "omental fat pad": 82178,
+        "skin of forearm": 32892,
+        "skin of pes": 32646,
+        "inguinal region skin": 10056,
+        "gonadal fat pad": 6539,
+        "pancreas": 1411636,
+        "tongue": 112725,
+        "diaphragm": 4850,
+        "limb muscle": 111326,
+        "brown adipose tissue": 3957,
+        "endothelial cell": 56790,
+        "cerebellar vermis": 236767,
+        "testis": 1219,
+        "gonad primordium": 399,
+        "mesoderm": 139,
+        "submucosa of ileum": 9015,
+        "submucosa of ascending colon": 6989,
+        "superior parietal cortex": 42614,
+        "cerebrocerebellum": 48747,
+        "perirhinal cortex": 21809,
+        "inferior parietal cortex": 19112,
+        "nucleus accumbens": 35731,
+        "Brodmann (1909) area 19": 11814,
+        "ventral lateral nucleus of thalamus": 23976,
+        "medial dorsal nucleus of thalamus": 10144,
+        "superior temporal sulcus": 5717,
+        "lateral geniculate body": 13543,
+        "medulla oblongata": 312,
+        "right frontal lobe": 90870,
+        "right parietal lobe": 52245,
+        "meningeal dura mater": 35748,
+        "dura mater": 702,
+        "brain meninx": 282,
+        "subdural space": 246,
+        "mesenteric lymph node": 67885,
+        "frontal cortex": 127978,
+        "choroid plexus": 33310,
+        "anterior hypothalamic region": 24475,
+        "gut wall": 170929,
+        "epithelial cell of alveolus of lung": 14952,
+        "cultured cell": 71512,
+        "zone of skin": 12787,
+        "ciliary body": 19320,
+        "muscle organ": 2410668,
+        "Brodmann (1909) area 23": 1424,
+        "esophagus": 14368,
+        "pyloric antrum": 11248,
+        "umbilical cord blood": 11302,
+        "outer medulla of kidney": 21560,
+        "inner medulla of kidney": 6904,
+        "brainstem": 92366,
+        "basal forebrain": 7885,
+        "perifoveal part of retina": 4422,
+        "endocrine pancreas": 2088,
+        "mesenteric artery": 6624,
+        "fimbria of uterine tube": 24914,
+        "ampulla of uterine tube": 30895,
+        "isthmus of fallopian tube": 33922,
+        "skin of back": 4402,
+        "skin of breast": 2732,
+        "nasopharynx": 24100,
+        "lamina propria of large intestine": 29204,
+        "lamina propria of small intestine": 62327,
+        "angular gyrus": 114354,
+        "pubis": 2762,
+        "descending colon": 1215,
+        "ascending colon": 2756,
+        "hepatic cecum": 768,
+        "mammary gland epithelial cell": 61935,
+        "caecum": 36317,
+        "left colon": 24992,
+        "right colon": 18368,
+        "upper outer quadrant of breast": 34306,
+        "skin epidermis": 7006,
+        "scalp": 3029,
+        "skin of external ear": 4239,
+        "lens of camera-type eye": 11296,
+        "skin of scalp": 48525,
+        "skin of trunk": 22552,
+        "transverse colon": 537,
+        "hepatic flexure of colon": 23,
+        "eye trabecular meshwork": 10728,
+        "corneo-scleral junction": 7491,
+        "epithelial cell of lung": 35350,
+        "retinal neural layer": 4316,
+        "chorioretinal region": 2512,
+        "cervical lymph node": 4355,
+        "putamen": 125801,
+        "nose": 188460,
+        "nasal cavity": 574,
+        "peripheral lymph node": 5625,
+        "left ovary": 2162,
+        "parietal peritoneum": 260,
+        "urinary bladder": 78,
+        "adrenal tissue": 9884,
+        "perirenal fat": 5334,
+        "vein": 1512,
+        "insular cortex": 295812,
+        "cingulate cortex": 280656,
+        "respiratory basal cell": 12211,
+        "duodeno-jejunal junction": 17920,
+        "Brodmann (1909) area 46": 34492,
+        "barrel cortex": 9775,
+        "caecum epithelium": 220,
+        "preadipocyte": 38642,
+        "trophoblast cell": 5247,
+        "retrosplenial granular cortex": 67319,
+        "frontal lobe": 34397,
+        "lateral visual area": 18939,
+        "upper leg skin": 2711,
+        "caudate nucleus": 24433,
+        "lateral nuclear group of thalamus": 12825,
+        "jejunum": 461,
+        "eye": 3814,
+        "dorsal thalamus": 65204,
+        "ventral thalamus": 14796,
+        "venous blood": 25296,
+        "bronchial epithelial cell": 59516
+    },
+    "sex": {
+        "female": 46072046,
+        "male": 61255034,
+        "unknown": 4409822
+    }
+}

teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json ADDED Viewed

	@@ -0,0 +1,862 @@

+{
+    "oligodendrocyte": "neural_cell",
+    "neuron": "neural_cell",
+    "astrocyte": "neural_cell",
+    "oligodendrocyte precursor cell": "neural_cell",
+    "microglial cell": "immune_cell",
+    "endothelial cell": "epithelial_cell",
+    "extravillous trophoblast": "embryonic_cell",
+    "placental villous trophoblast": "embryonic_cell",
+    "syncytiotrophoblast cell": "embryonic_cell",
+    "skin fibroblast": "connective_cell",
+    "T cell": "hematopoietic_cell",
+    "enterocyte": "ciliated_cell",
+    "endothelial cell of lymphatic vessel": "epithelial_cell",
+    "fibroblast": "connective_cell",
+    "blood vessel endothelial cell": "epithelial_cell",
+    "B cell": "hematopoietic_cell",
+    "enteroendocrine cell": "secretory_cell",
+    "macrophage": "immune_cell",
+    "dendritic cell": "immune_cell",
+    "vascular leptomeningeal cell": "connective_cell",
+    "retina horizontal cell": "neural_cell",
+    "natural killer cell": "hematopoietic_cell",
+    "large pre-B-II cell": "hematopoietic_cell",
+    "small pre-B-II cell": "hematopoietic_cell",
+    "double negative thymocyte": "hematopoietic_cell",
+    "pro-B cell": "precursor_cell",
+    "group 3 innate lymphoid cell": "hematopoietic_cell",
+    "late pro-B cell": "precursor_cell",
+    "fraction A pre-pro B cell": "hematopoietic_cell",
+    "B-2 B cell": "hematopoietic_cell",
+    "unknown": "unknown",
+    "early lymphoid progenitor": "precursor_cell",
+    "double-positive, alpha-beta thymocyte": "hematopoietic_cell",
+    "hematopoietic stem cell": "hematopoietic_cell",
+    "naive thymus-derived CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "hematopoietic multipotent progenitor cell": "precursor_cell",
+    "B-1 B cell": "hematopoietic_cell",
+    "naive thymus-derived CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "megakaryocyte-erythroid progenitor cell": "precursor_cell",
+    "regulatory T cell": "hematopoietic_cell",
+    "mature B cell": "hematopoietic_cell",
+    "group 2 innate lymphoid cell": "hematopoietic_cell",
+    "innate lymphoid cell": "hematopoietic_cell",
+    "immature B cell": "hematopoietic_cell",
+    "common myeloid progenitor": "precursor_cell",
+    "CD8-alpha-alpha-positive, alpha-beta intraepithelial T cell": "hematopoietic_cell",
+    "granulocyte monocyte progenitor cell": "precursor_cell",
+    "plasma cell": "hematopoietic_cell",
+    "kidney proximal convoluted tubule epithelial cell": "ciliated_cell",
+    "leukocyte": "hematopoietic_cell",
+    "kidney loop of Henle thick ascending limb epithelial cell": "epithelial_cell",
+    "kidney distal convoluted tubule epithelial cell": "epithelial_cell",
+    "kidney interstitial fibroblast": "connective_cell",
+    "blood vessel smooth muscle cell": "contractile_cell",
+    "kidney collecting duct principal cell": "epithelial_cell",
+    "kidney collecting duct intercalated cell": "epithelial_cell",
+    "podocyte": "epithelial_cell",
+    "mesangial cell": "connective_cell",
+    "kidney granular cell": "contractile_cell",
+    "macula densa epithelial cell": "epithelial_cell",
+    "muscle cell": "contractile_cell",
+    "fibroblast of dermis": "connective_cell",
+    "tendon cell": "connective_cell",
+    "Schwann cell": "neural_cell",
+    "chondrocyte": "connective_cell",
+    "smooth muscle cell": "contractile_cell",
+    "endothelial cell of artery": "epithelial_cell",
+    "reticulocyte": "hematopoietic_cell",
+    "vein endothelial cell": "epithelial_cell",
+    "pericyte": "perivascular_cell",
+    "peridermal cell": "epithelial_cell",
+    "basal cell": "epithelial_cell",
+    "articular chondrocyte": "connective_cell",
+    "mesenchymal cell": "connective_cell",
+    "connective tissue cell": "connective_cell",
+    "erythrocyte": "hematopoietic_cell",
+    "hypertrophic chondrocyte": "connective_cell",
+    "megakaryocyte": "hematopoietic_cell",
+    "muscle fibroblast": "skeletal_muscle",
+    "mature NK T cell": "hematopoietic_cell",
+    "myeloid cell": "immune_cell",
+    "kidney interstitial cell": "connective_cell",
+    "epithelial cell of nephron": "epithelial_cell",
+    "mesenchymal stem cell": "connective_cell",
+    "epithelial cell of proximal tubule": "ciliated_cell",
+    "kidney connecting tubule epithelial cell": "epithelial_cell",
+    "epithelial cell of glomerular capsule": "epithelial_cell",
+    "nephron tubule epithelial cell": "epithelial_cell",
+    "kidney collecting duct cell": "epithelial_cell",
+    "stromal cell of ovary": "connective_cell",
+    "granulosa cell": "epithelial_cell",
+    "theca cell": "connective_cell",
+    "epithelial cell": "epithelial_cell",
+    "epithelial cell of alveolus of lung": "epithelial_cell",
+    "goblet cell": "epithelial_cell",
+    "ionocyte": "epithelial_cell",
+    "hepatocyte": "epithelial_cell",
+    "ciliated epithelial cell": "ciliated_cell",
+    "neuroendocrine cell": "secretory_cell",
+    "club cell": "precursor_cell",
+    "brush cell": "epithelial_cell",
+    "platelet": "hematopoietic_cell",
+    "central nervous system macrophage": "immune_cell",
+    "ependymal cell": "ciliated_cell",
+    "vascular associated smooth muscle cell": "contractile_cell",
+    "mesothelial cell": "epithelial_cell",
+    "neutrophil": "immune_cell",
+    "monocyte": "precursor_cell",
+    "stromal cell": "connective_cell",
+    "cord blood hematopoietic stem cell": "hematopoietic_cell",
+    "mast cell": "hematopoietic_cell",
+    "professional antigen presenting cell": "hematopoietic_cell",
+    "erythroid lineage cell": "hematopoietic_cell",
+    "primordial germ cell": "unknown",
+    "alternatively activated macrophage": "immune_cell",
+    "L2/3-6 intratelencephalic projecting glutamatergic neuron": "neural_cell",
+    "pvalb GABAergic cortical interneuron": "neural_cell",
+    "chandelier pvalb GABAergic cortical interneuron": "neural_cell",
+    "sst GABAergic cortical interneuron": "neural_cell",
+    "Bergmann glial cell": "neural_cell",
+    "glutamatergic neuron": "neural_cell",
+    "transit amplifying cell of colon": "epithelial_cell",
+    "CD8-alpha-beta-positive, alpha-beta intraepithelial T cell": "hematopoietic_cell",
+    "intestinal crypt stem cell": "epithelial_cell",
+    "intestinal tuft cell": "epithelial_cell",
+    "enteric smooth muscle cell": "contractile_cell",
+    "smooth muscle cell of large intestine": "contractile_cell",
+    "interstitial cell of Cajal": "epithelial_cell",
+    "smooth muscle cell of small intestine": "contractile_cell",
+    "cardiac valve cell": "embryonic_cell",
+    "primitive red blood cell": "hematopoietic_cell",
+    "neurectodermal cell": "embryonic_cell",
+    "midbrain dopaminergic neuron": "neural_cell",
+    "paraxial cell": "embryonic_cell",
+    "mesodermal cell": "embryonic_cell",
+    "splanchnic mesodermal cell": "embryonic_cell",
+    "neuroplacodal cell": "embryonic_cell",
+    "premigratory neural crest cell": "embryonic_cell",
+    "notochordal cell": "epithelial_cell",
+    "hemangioblast": "embryonic_cell",
+    "spinal cord interneuron": "neural_cell",
+    "endodermal cell": "unknown",
+    "surface ectodermal cell": "embryonic_cell",
+    "gut endothelial cell": "epithelial_cell",
+    "anterior visceral endoderm cell": "embryonic_cell",
+    "activated CD4-negative, CD8-negative type I NK T cell": "hematopoietic_cell",
+    "parietal epithelial cell": "epithelial_cell",
+    "kidney loop of Henle epithelial cell": "epithelial_cell",
+    "kidney loop of Henle thin descending limb epithelial cell": "epithelial_cell",
+    "malignant cell": "unknown",
+    "exhausted T cell": "hematopoietic_cell",
+    "CD4-positive helper T cell": "hematopoietic_cell",
+    "CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "promonocyte": "precursor_cell",
+    "granulocyte": "hematopoietic_cell",
+    "osteoclast": "unknown",
+    "promyelocyte": "precursor_cell",
+    "Kupffer cell": "immune_cell",
+    "pre-conventional dendritic cell": "immune_cell",
+    "myelocyte": "precursor_cell",
+    "plasmacytoid dendritic cell": "immune_cell",
+    "common dendritic progenitor": "precursor_cell",
+    "mural cell": "perivascular_cell",
+    "myofibroblast cell": "connective_cell",
+    "glial cell": "neural_cell",
+    "lymphocyte": "hematopoietic_cell",
+    "retinal ganglion cell": "neural_cell",
+    "lamp5 GABAergic cortical interneuron": "neural_cell",
+    "luminal epithelial cell of mammary gland": "epithelial_cell",
+    "endothelial cell of vascular tree": "epithelial_cell",
+    "mammary gland epithelial cell": "epithelial_cell",
+    "adipocyte of breast": "connective_cell",
+    "IgA plasma cell": "hematopoietic_cell",
+    "class switched memory B cell": "hematopoietic_cell",
+    "naive B cell": "hematopoietic_cell",
+    "IgG plasma cell": "hematopoietic_cell",
+    "unswitched memory B cell": "hematopoietic_cell",
+    "centrilobular region hepatocyte": "epithelial_cell",
+    "periportal region hepatocyte": "epithelial_cell",
+    "blood cell": "hematopoietic_cell",
+    "tracheal epithelial cell": "epithelial_cell",
+    "medium spiny neuron": "neural_cell",
+    "inhibitory interneuron": "neural_cell",
+    "cell": "unknown",
+    "uterine smooth muscle cell": "contractile_cell",
+    "decidual natural killer cell, human": "connective_cell",
+    "endothelial cell of uterus": "epithelial_cell",
+    "trophoblast giant cell": "embryonic_cell",
+    "embryonic fibroblast": "connective_cell",
+    "cardiac endothelial cell": "epithelial_cell",
+    "fibroblast of cardiac tissue": "connective_cell",
+    "immature innate lymphoid cell": "hematopoietic_cell",
+    "cardiac muscle myoblast": "precursor_cell",
+    "lymphoid lineage restricted progenitor cell": "precursor_cell",
+    "smooth muscle myoblast": "precursor_cell",
+    "neuronal receptor cell": "neural_cell",
+    "fibroblast of lymphatic vessel": "connective_cell",
+    "flat midget bipolar cell": "neural_cell",
+    "classical monocyte": "precursor_cell",
+    "conventional dendritic cell": "immune_cell",
+    "CD14-positive monocyte": "precursor_cell",
+    "effector memory CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "CD14-positive, CD16-positive monocyte": "precursor_cell",
+    "central memory CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "CD56-positive, CD161-positive immature natural killer cell, human": "hematopoietic_cell",
+    "CD16-positive, CD56-dim natural killer cell, human": "hematopoietic_cell",
+    "CD8-positive, alpha-beta cytotoxic T cell": "hematopoietic_cell",
+    "supporting cell": "unknown",
+    "interstitial cell of ovary": "connective_cell",
+    "hematopoietic cell": "hematopoietic_cell",
+    "neural cell": "neural_cell",
+    "germ cell": "unknown",
+    "ovarian surface epithelial cell": "epithelial_cell",
+    "L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "neural_cell",
+    "L6 corticothalamic-projecting glutamatergic cortical neuron": "neural_cell",
+    "vip GABAergic cortical interneuron": "neural_cell",
+    "L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "neural_cell",
+    "hippocampal neuron": "neural_cell",
+    "L6b glutamatergic cortical neuron": "neural_cell",
+    "L5/6 near-projecting glutamatergic neuron of the primary motor cortex": "neural_cell",
+    "L5 extratelencephalic projecting glutamatergic cortical neuron": "neural_cell",
+    "pyramidal neuron": "neural_cell",
+    "sncg GABAergic cortical interneuron": "neural_cell",
+    "corticothalamic-projecting glutamatergic cortical neuron": "neural_cell",
+    "L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "neural_cell",
+    "sst chodl GABAergic cortical interneuron": "neural_cell",
+    "cortical interneuron": "neural_cell",
+    "vascular leptomeningeal cell (Mmus)": "connective_cell",
+    "meis2 expressing cortical GABAergic cell": "secretory_cell",
+    "Cajal-Retzius cell": "neural_cell",
+    "fibroblast of lung": "connective_cell",
+    "type I pneumocyte": "epithelial_cell",
+    "type II pneumocyte": "epithelial_cell",
+    "gut absorptive cell": "epithelial_cell",
+    "progenitor cell": "precursor_cell",
+    "intestinal crypt stem cell of large intestine": "precursor_cell",
+    "transit amplifying cell of small intestine": "epithelial_cell",
+    "intestinal crypt stem cell of small intestine": "precursor_cell",
+    "secretory cell": "secretory_cell",
+    "intestine goblet cell": "epithelial_cell",
+    "enterocyte of epithelium of large intestine": "ciliated_cell",
+    "paneth cell of epithelium of small intestine": "secretory_cell",
+    "intestinal enteroendocrine cell": "secretory_cell",
+    "duodenum glandular cell": "secretory_cell",
+    "large intestine goblet cell": "epithelial_cell",
+    "T follicular helper cell": "hematopoietic_cell",
+    "GABAergic neuron": "neural_cell",
+    "fibroblast of mammary gland": "connective_cell",
+    "perivascular cell": "epithelial_cell",
+    "luminal adaptive secretory precursor cell of mammary gland": "epithelial_cell",
+    "endothelial tip cell": "epithelial_cell",
+    "CD8-positive, alpha-beta memory T cell": "hematopoietic_cell",
+    "luminal hormone-sensing cell of mammary gland": "epithelial_cell",
+    "myoepithelial cell of mammary gland": "contractile_cell",
+    "capillary endothelial cell": "epithelial_cell",
+    "brain vascular cell": "neural_cell",
+    "dopaminergic neuron": "neural_cell",
+    "serotonergic neuron": "neural_cell",
+    "cerebellar neuron": "neural_cell",
+    "neural progenitor cell": "neural_cell",
+    "CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "glycinergic amacrine cell": "neural_cell",
+    "starburst amacrine cell": "neural_cell",
+    "retinal rod cell": "neural_cell",
+    "Mueller cell": "neural_cell",
+    "rod bipolar cell": "neural_cell",
+    "ON-bipolar cell": "neural_cell",
+    "OFF-bipolar cell": "neural_cell",
+    "retinal cone cell": "neural_cell",
+    "amacrine cell": "neural_cell",
+    "melanocyte": "secretory_cell",
+    "retinal pigment epithelial cell": "epithelial_cell",
+    "adipocyte": "embryonic_cell",
+    "fibro/adipogenic progenitor cell": "precursor_cell",
+    "neuron associated cell": "neural_cell",
+    "inhibitory motor neuron": "neural_cell",
+    "motor neuron": "neural_cell",
+    "precursor B cell": "hematopoietic_cell",
+    "interneuron": "neural_cell",
+    "fallopian tube secretory epithelial cell": "epithelial_cell",
+    "suprabasal keratinocyte": "epithelial_cell",
+    "basal cell of epidermis": "epithelial_cell",
+    "proerythroblast": "hematopoietic_cell",
+    "kidney loop of Henle ascending limb epithelial cell": "epithelial_cell",
+    "collagen secreting cell": "connective_cell",
+    "epithelial cell of proximal tubule segment 1": "ciliated_cell",
+    "MHC-II-positive classical monocyte": "precursor_cell",
+    "naive T cell": "hematopoietic_cell",
+    "chondroblast": "connective_cell",
+    "osteoblast": "connective_cell",
+    "myoblast": "precursor_cell",
+    "skeletal muscle myoblast": "skeletal_muscle",
+    "Schwann cell precursor": "neural_cell",
+    "keratinocyte": "epithelial_cell",
+    "inflammatory macrophage": "immune_cell",
+    "monocyte-derived dendritic cell": "immune_cell",
+    "Langerhans cell": "immune_cell",
+    "cytotoxic T cell": "hematopoietic_cell",
+    "forebrain neuroblast": "neural_cell",
+    "chandelier cell": "neural_cell",
+    "caudal ganglionic eminence derived GABAergic cortical interneuron": "neural_cell",
+    "basal cell of prostate epithelium": "epithelial_cell",
+    "epithelial cell of urethra": "epithelial_cell",
+    "luminal cell of prostate epithelium": "epithelial_cell",
+    "prostate gland microvascular endothelial cell": "epithelial_cell",
+    "prostate stromal cell": "connective_cell",
+    "smooth muscle cell of prostate": "contractile_cell",
+    "lymphocyte of B lineage": "hematopoietic_cell",
+    "smooth muscle cell of the pulmonary artery": "contractile_cell",
+    "acinar cell of salivary gland": "epithelial_cell",
+    "memory B cell": "hematopoietic_cell",
+    "adventitial cell": "connective_cell",
+    "duct epithelial cell": "epithelial_cell",
+    "endothelial cell of hepatic sinusoid": "epithelial_cell",
+    "non-classical monocyte": "precursor_cell",
+    "plasmablast": "hematopoietic_cell",
+    "glomerular endothelial cell": "epithelial_cell",
+    "renal intercalated cell": "epithelial_cell",
+    "vasa recta ascending limb cell": "epithelial_cell",
+    "vasa recta descending limb cell": "epithelial_cell",
+    "kidney epithelial cell": "epithelial_cell",
+    "renal beta-intercalated cell": "epithelial_cell",
+    "renal alpha-intercalated cell": "epithelial_cell",
+    "urothelial cell": "epithelial_cell",
+    "renal principal cell": "epithelial_cell",
+    "cell of skeletal muscle": "skeletal_muscle",
+    "thymocyte": "hematopoietic_cell",
+    "pro-T cell": "precursor_cell",
+    "hematopoietic precursor cell": "hematopoietic_cell",
+    "stem cell": "precursor_cell",
+    "paneth cell": "secretory_cell",
+    "type L enteroendocrine cell": "secretory_cell",
+    "type EC enteroendocrine cell": "secretory_cell",
+    "hepatic stellate cell": "connective_cell",
+    "cholangiocyte": "epithelial_cell",
+    "endothelial cell of periportal hepatic sinusoid": "epithelial_cell",
+    "endothelial cell of pericentral hepatic sinusoid": "epithelial_cell",
+    "alveolar macrophage": "immune_cell",
+    "effector memory CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "myeloid leukocyte": "hematopoietic_cell",
+    "CD1c-positive myeloid dendritic cell": "immune_cell",
+    "myeloid dendritic cell, human": "immune_cell",
+    "stratified epithelial cell": "epithelial_cell",
+    "epithelial cell of stratum germinativum of esophagus": "epithelial_cell",
+    "mononuclear phagocyte": "immune_cell",
+    "mucus secreting cell": "secretory_cell",
+    "regular atrial cardiac myocyte": "contractile_cell",
+    "Tc1 cell": "hematopoietic_cell",
+    "endothelial cell of placenta": "epithelial_cell",
+    "Hofbauer cell": "immune_cell",
+    "group 3 innate lymphoid cell, human": "hematopoietic_cell",
+    "kidney collecting duct epithelial cell": "epithelial_cell",
+    "fenestrated cell": "epithelial_cell",
+    "early T lineage precursor": "hematopoietic_cell",
+    "CD4-positive, alpha-beta memory T cell": "hematopoietic_cell",
+    "erythroid progenitor cell": "precursor_cell",
+    "central memory CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "gamma-delta T cell": "hematopoietic_cell",
+    "early promyelocyte": "precursor_cell",
+    "CD16-negative, CD56-bright natural killer cell, human": "hematopoietic_cell",
+    "megakaryocyte progenitor cell": "precursor_cell",
+    "late promyelocyte": "precursor_cell",
+    "basophil mast progenitor cell": "precursor_cell",
+    "CD4-positive, alpha-beta cytotoxic T cell": "hematopoietic_cell",
+    "airway submucosal gland duct basal cell": "epithelial_cell",
+    "serous secreting cell of bronchus submucosal gland": "epithelial_cell",
+    "ciliated cell": "ciliated_cell",
+    "lung secretory cell": "secretory_cell",
+    "myoepithelial cell": "contractile_cell",
+    "lung macrophage": "immune_cell",
+    "mesenchymal stem cell of adipose tissue": "precursor_cell",
+    "regular ventricular cardiac myocyte": "contractile_cell",
+    "choroid plexus epithelial cell": "epithelial_cell",
+    "aortic endothelial cell": "epithelial_cell",
+    "fibrocyte": "connective_cell",
+    "kidney loop of Henle thin ascending limb epithelial cell": "epithelial_cell",
+    "kidney interstitial alternatively activated macrophage": "immune_cell",
+    "renal interstitial pericyte": "perivascular_cell",
+    "papillary tips cell": "unknown",
+    "fast muscle cell": "skeletal_muscle",
+    "skeletal muscle fiber": "skeletal_muscle",
+    "slow muscle cell": "skeletal_muscle",
+    "skeletal muscle satellite cell": "skeletal_muscle",
+    "retinal blood vessel endothelial cell": "epithelial_cell",
+    "non-myelinating Schwann cell": "neural_cell",
+    "lung perichondrial fibroblast": "connective_cell",
+    "respiratory suprabasal cell": "epithelial_cell",
+    "lung pericyte": "perivascular_cell",
+    "memory T cell": "hematopoietic_cell",
+    "leptomeningeal cell": "connective_cell",
+    "Sertoli cell": "secretory_cell",
+    "macroglial cell": "neural_cell",
+    "retinal bipolar neuron": "neural_cell",
+    "cerebellar granule cell": "neural_cell",
+    "intermediate monocyte": "precursor_cell",
+    "erythroblast": "hematopoietic_cell",
+    "midzonal region hepatocyte": "epithelial_cell",
+    "endothelial cell of venule": "epithelial_cell",
+    "helper T cell": "hematopoietic_cell",
+    "mucosal invariant T cell": "hematopoietic_cell",
+    "T-helper 17 cell": "hematopoietic_cell",
+    "olfactory epithelial cell": "epithelial_cell",
+    "auditory epithelial cell": "epithelial_cell",
+    "endo-epithelial cell": "epithelial_cell",
+    "epithelial cell of amnion": "epithelial_cell",
+    "intermediate mesodermal cell": "embryonic_cell",
+    "ectodermal cell": "embryonic_cell",
+    "metanephric mesenchyme stem cell": "precursor_cell",
+    "ureteric bud cell": "epithelial_cell",
+    "pituitary gland cell": "neural_cell",
+    "pancreatic acinar cell": "epithelial_cell",
+    "lens epithelial cell": "epithelial_cell",
+    "epithelial cell of parathyroid gland": "epithelial_cell",
+    "epithelial cell of thymus": "epithelial_cell",
+    "intrahepatic cholangiocyte": "epithelial_cell",
+    "epithelial cell of thyroid gland": "epithelial_cell",
+    "peripheral nervous system neuron": "neural_cell",
+    "neural crest cell": "embryonic_cell",
+    "sensory neuron": "neural_cell",
+    "cerebral cortex endothelial cell": "epithelial_cell",
+    "microvascular endothelial cell": "epithelial_cell",
+    "brain pericyte": "perivascular_cell",
+    "endocardial cell": "epithelial_cell",
+    "adipocyte of epicardial fat of left ventricle": "connective_cell",
+    "CD14-low, CD16-positive monocyte": "precursor_cell",
+    "DN4 thymocyte": "hematopoietic_cell",
+    "pancreatic stellate cell": "connective_cell",
+    "pancreatic ductal cell": "epithelial_cell",
+    "type B pancreatic cell": "immune_cell",
+    "CD8-positive, alpha-beta memory T cell, CD45RO-positive": "hematopoietic_cell",
+    "alpha-beta T cell": "hematopoietic_cell",
+    "effector memory CD8-positive, alpha-beta T cell, terminally differentiated": "hematopoietic_cell",
+    "brown preadipocyte": "connective_cell",
+    "brown adipocyte": "connective_cell",
+    "lung ciliated cell": "ciliated_cell",
+    "effector CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "T-helper 22 cell": "hematopoietic_cell",
+    "myeloid dendritic cell": "immune_cell",
+    "dendritic cell, human": "immune_cell",
+    "erythroid progenitor cell, mammalian": "precursor_cell",
+    "ILC1, human": "hematopoietic_cell",
+    "CD34-positive, CD38-negative hematopoietic stem cell": "precursor_cell",
+    "IgM plasma cell": "hematopoietic_cell",
+    "T-helper 1 cell": "hematopoietic_cell",
+    "group 2 innate lymphoid cell, human": "hematopoietic_cell",
+    "myeloid lineage restricted progenitor cell": "precursor_cell",
+    "T-helper 2 cell": "hematopoietic_cell",
+    "astrocyte of the cerebral cortex": "neural_cell",
+    "near-projecting glutamatergic cortical neuron": "neural_cell",
+    "effector CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "type I NK T cell": "hematopoietic_cell",
+    "CD141-positive myeloid dendritic cell": "immune_cell",
+    "mature conventional dendritic cell": "immune_cell",
+    "melanocyte of skin": "secretory_cell",
+    "pancreatic A cell": "epithelial_cell",
+    "pancreatic D cell": "epithelial_cell",
+    "pancreatic PP cell": "epithelial_cell",
+    "CD14-positive, CD16-negative classical monocyte": "precursor_cell",
+    "CD4-positive, CD25-positive, alpha-beta regulatory T cell": "hematopoietic_cell",
+    "kidney connecting tubule principal cell": "epithelial_cell",
+    "epithelial cell of large intestine": "epithelial_cell",
+    "Purkinje cell": "neural_cell",
+    "granule cell": "neural_cell",
+    "neuron associated cell (sensu Vertebrata)": "neural_cell",
+    "stellate neuron": "neural_cell",
+    "neuronal brush cell": "epithelial_cell",
+    "myotube": "contractile_cell",
+    "muscle precursor cell": "precursor_cell",
+    "transitional stage B cell": "hematopoietic_cell",
+    "immature neutrophil": "immune_cell",
+    "medial ganglionic eminence derived interneuron": "neural_cell",
+    "caudal ganglionic eminence derived interneuron": "neural_cell",
+    "bronchus fibroblast of lung": "connective_cell",
+    "pigmented epithelial cell": "epithelial_cell",
+    "smooth muscle cell of sphincter of pupil": "contractile_cell",
+    "IgG plasmablast": "hematopoietic_cell",
+    "IgA plasmablast": "hematopoietic_cell",
+    "plasmatocyte": "immune_cell",
+    "kidney cortex artery cell": "epithelial_cell",
+    "kidney capillary endothelial cell": "epithelial_cell",
+    "kidney proximal straight tubule epithelial cell": "ciliated_cell",
+    "cardiac muscle cell": "contractile_cell",
+    "mesothelial cell of epicardium": "epithelial_cell",
+    "fetal cardiomyocyte": "contractile_cell",
+    "cardiac mesenchymal cell": "embryonic_cell",
+    "pneumocyte": "epithelial_cell",
+    "mononuclear cell": "hematopoietic_cell",
+    "tonsil germinal center B cell": "hematopoietic_cell",
+    "centroblast": "hematopoietic_cell",
+    "centrocyte": "hematopoietic_cell",
+    "macrophage dendritic cell progenitor": "precursor_cell",
+    "immature NK T cell": "hematopoietic_cell",
+    "neuroblast (sensu Vertebrata)": "neural_cell",
+    "alveolar type 2 fibroblast cell": "connective_cell",
+    "tracheobronchial smooth muscle cell": "contractile_cell",
+    "lung goblet cell": "epithelial_cell",
+    "respiratory basal cell": "epithelial_cell",
+    "brush cell of trachebronchial tree": "epithelial_cell",
+    "mesothelial fibroblast": "connective_cell",
+    "bladder urothelial cell": "unknown",
+    "bladder cell": "unknown",
+    "neoplastic cell": "epithelial_cell",
+    "endothelial cell of coronary artery": "epithelial_cell",
+    "cardiac neuron": "neural_cell",
+    "OFF retinal ganglion cell": "neural_cell",
+    "ON retinal ganglion cell": "neural_cell",
+    "lung resident memory CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "lung resident memory CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "deuterosomal cell": "epithelial_cell",
+    "granulocytopoietic cell": "hematopoietic_cell",
+    "basophil": "hematopoietic_cell",
+    "PP cell": "epithelial_cell",
+    "pancreatic epsilon cell": "epithelial_cell",
+    "fibroblast of connective tissue of prostate": "connective_cell",
+    "double negative T regulatory cell": "immune_cell",
+    "progenitor cell of mammary luminal epithelium": "precursor_cell",
+    "lactocyte": "secretory_cell",
+    "vascular lymphangioblast": "immune_cell",
+    "lung endothelial cell": "epithelial_cell",
+    "respiratory goblet cell": "epithelial_cell",
+    "cardiac pacemaker cell of sinoatrial node": "contractile_cell",
+    "activated CD4-positive, alpha-beta T cell": "hematopoietic_cell",
+    "differentiation-committed oligodendrocyte precursor": "neural_cell",
+    "glycinergic neuron": "neural_cell",
+    "keratinocyte stem cell": "precursor_cell",
+    "bronchial smooth muscle cell": "contractile_cell",
+    "epidermal cell": "epithelial_cell",
+    "basal epithelial cell of tracheobronchial tree": "epithelial_cell",
+    "neural stem cell": "precursor_cell",
+    "mature alpha-beta T cell": "hematopoietic_cell",
+    "brush cell of epithelium proper of large intestine": "epithelial_cell",
+    "smooth muscle cell of trachea": "contractile_cell",
+    "ciliated columnar cell of tracheobronchial tree": "ciliated_cell",
+    "early pro-B cell": "precursor_cell",
+    "pulmonary interstitial fibroblast": "connective_cell",
+    "neuroepithelial stem cell": "precursor_cell",
+    "lung neuroendocrine cell": "secretory_cell",
+    "common lymphoid progenitor": "precursor_cell",
+    "plasmacytoid dendritic cell, human": "immune_cell",
+    "activated CD4-positive, alpha-beta T cell, human": "hematopoietic_cell",
+    "lateral mesodermal cell": "embryonic_cell",
+    "hypothalamus cell": "neural_cell",
+    "primitive erythroid progenitor": "precursor_cell",
+    "retinal progenitor cell": "precursor_cell",
+    "spinal cord motor neuron": "neural_cell",
+    "cranial motor neuron": "neural_cell",
+    "enteric neuron": "neural_cell",
+    "spiral ganglion neuron": "neural_cell",
+    "cerebral cortex GABAergic interneuron": "neural_cell",
+    "embryonic blood vessel endothelial progenitor cell": "neural_cell",
+    "sympathetic neuron": "neural_cell",
+    "olfactory receptor cell": "neural_cell",
+    "extraembryonic cell": "embryonic_cell",
+    "fibroblast of breast": "connective_cell",
+    "endothelial cell of umbilical vein": "epithelial_cell",
+    "transit amplifying cell": "unknown",
+    "M cell of gut": "epithelial_cell",
+    "hypendymal cell": "epithelial_cell",
+    "oogonial cell": "unknown",
+    "female germ cell": "unknown",
+    "male germ cell": "unknown",
+    "oocyte": "unknown",
+    "basket cell": "secretory_cell",
+    "epithelial cell of prostate": "epithelial_cell",
+    "basal epithelial cell of prostatic duct": "epithelial_cell",
+    "contractile cell": "contractile_cell",
+    "mature T cell": "hematopoietic_cell",
+    "eosinophil": "hematopoietic_cell",
+    "corneal epithelial cell": "epithelial_cell",
+    "corneal endothelial cell": "epithelial_cell",
+    "activated CD8-positive, alpha-beta T cell": "hematopoietic_cell",
+    "follicular B cell": "hematopoietic_cell",
+    "colon macrophage": "immune_cell",
+    "myelinating Schwann cell": "neural_cell",
+    "cell in vitro": "unknown",
+    "S cone cell": "neural_cell",
+    "lung interstitial macrophage": "connective_cell",
+    "Leydig cell": "unknown",
+    "L2/3 intratelencephalic projecting glutamatergic neuron": "neural_cell",
+    "enterocyte of colon": "ciliated_cell",
+    "mesenchymal lymphangioblast": "precursor_cell",
+    "colon epithelial cell": "epithelial_cell",
+    "CD34-positive, CD56-positive, CD117-positive common innate lymphoid precursor, human": "precursor_cell",
+    "NKp44-positive group 3 innate lymphoid cell, human": "hematopoietic_cell",
+    "NKp44-negative group 3 innate lymphoid cell, human": "hematopoietic_cell",
+    "primary sensory neuron (sensu Teleostei)": "neural_cell",
+    "type N enteroendocrine cell": "secretory_cell",
+    "progenitor cell of endocrine pancreas": "precursor_cell",
+    "CD4-positive, alpha-beta thymocyte": "hematopoietic_cell",
+    "fibroblast of connective tissue of nonglandular part of prostate": "connective_cell",
+    "fibroblast of connective tissue of glandular part of prostate": "connective_cell",
+    "CD8-positive, alpha-beta thymocyte": "hematopoietic_cell",
+    "enucleate erythrocyte": "hematopoietic_cell",
+    "lung microvascular endothelial cell": "epithelial_cell",
+    "serous cell of epithelium of bronchus": "secretory_cell",
+    "pulmonary ionocyte": "epithelial_cell",
+    "epithelial cell of pancreas": "epithelial_cell",
+    "cultured cell": "unknown",
+    "reticular cell": "unknown",
+    "inflammatory cell": "immune_cell",
+    "stem cell of epidermis": "precursor_cell",
+    "pigmented ciliary epithelial cell": "epithelial_cell",
+    "non-pigmented ciliary epithelial cell": "epithelial_cell",
+    "ciliary muscle cell": "contractile_cell",
+    "acinar cell": "epithelial_cell",
+    "endocrine cell": "secretory_cell",
+    "non-terminally differentiated cell": "unknown",
+    "pre-natural killer cell": "hematopoietic_cell",
+    "midget ganglion cell of retina": "neural_cell",
+    "GABAergic amacrine cell": "neural_cell",
+    "diffuse bipolar 3b cell": "neural_cell",
+    "diffuse bipolar 2 cell": "neural_cell",
+    "ON parasol ganglion cell": "neural_cell",
+    "diffuse bipolar 1 cell": "neural_cell",
+    "invaginating midget bipolar cell": "neural_cell",
+    "diffuse bipolar 3a cell": "neural_cell",
+    "H2 horizontal cell": "neural_cell",
+    "OFFx cell": "neural_cell",
+    "H1 horizontal cell": "neural_cell",
+    "diffuse bipolar 4 cell": "neural_cell",
+    "diffuse bipolar 6 cell": "neural_cell",
+    "OFF parasol ganglion cell": "neural_cell",
+    "hepatic pit cell": "hematopoietic_cell",
+    "follicular dendritic cell": "immune_cell",
+    "mature gamma-delta T cell": "hematopoietic_cell",
+    "thalamic excitatory neuron": "neural_cell",
+    "small bistratified retinal ganglion cell": "neural_cell",
+    "mature microglial cell": "neural_cell",
+    "intestinal epithelial cell": "epithelial_cell",
+    "epithelial cell of lung": "epithelial_cell",
+    "CD38-negative naive B cell": "hematopoietic_cell",
+    "urethra urothelial cell": "epithelial_cell",
+    "seminal vesicle glandular cell": "secretory_cell",
+    "type I cell of adrenal cortex": "epithelial_cell",
+    "germinal center B cell": "hematopoietic_cell",
+    "kidney cell": "unknown",
+    "kidney loop of Henle medullary thick ascending limb epithelial cell": "epithelial_cell",
+    "kidney loop of Henle cortical thick ascending limb epithelial cell": "epithelial_cell",
+    "kidney cortex tubule cell": "epithelial_cell",
+    "kidney glomerular epithelial cell": "epithelial_cell",
+    "preadipocyte": "connective_cell",
+    "type 6 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 5a cone bipolar cell": "neural_cell",
+    "type 7 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 3b cone bipolar cell": "neural_cell",
+    "type 3a cone bipolar cell": "neural_cell",
+    "type 5b cone bipolar cell": "neural_cell",
+    "type 5 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 8 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 9 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 2 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 4 cone bipolar cell (sensu Mus)": "neural_cell",
+    "type 1 cone bipolar cell (sensu Mus)": "neural_cell",
+    "cerebellar granule cell precursor": "neural_cell",
+    "unipolar brush cell": "epithelial_cell",
+    "glioblast": "precursor_cell",
+    "immature astrocyte": "neural_cell",
+    "meningeal macrophage": "immune_cell",
+    "noradrenergic cell": "secretory_cell",
+    "multi-ciliated epithelial cell": "ciliated_cell",
+    "pulmonary artery endothelial cell": "epithelial_cell",
+    "cone retinal bipolar cell": "neural_cell",
+    "retinal astrocyte": "neural_cell",
+    "efferent neuron": "neural_cell",
+    "enterocyte of epithelium proper of ileum": "ciliated_cell",
+    "ileal goblet cell": "epithelial_cell",
+    "smooth muscle fiber of ileum": "contractile_cell",
+    "enteroendocrine cell of small intestine": "secretory_cell",
+    "aortic smooth muscle cell": "contractile_cell",
+    "mesothelial cell of visceral pleura": "epithelial_cell",
+    "ciliated cell of the bronchus": "ciliated_cell",
+    "squamous epithelial cell": "epithelial_cell",
+    "nasal mucosa goblet cell": "epithelial_cell",
+    "memory regulatory T cell": "hematopoietic_cell",
+    "naive regulatory T cell": "hematopoietic_cell",
+    "myeloid suppressor cell": "immune_cell",
+    "adipose macrophage": "immune_cell",
+    "absorptive cell": "unknown",
+    "intestinal crypt stem cell of colon": "precursor_cell",
+    "mature astrocyte": "neural_cell",
+    "hair follicular keratinocyte": "epithelial_cell",
+    "sebum secreting cell": "secretory_cell",
+    "granular cell of epidermis": "epithelial_cell",
+    "anterior lens cell": "neural_cell",
+    "secondary lens fiber": "epithelial_cell",
+    "lens fiber cell": "epithelial_cell",
+    "A2 amacrine cell": "neural_cell",
+    "sperm": "unknown",
+    "abnormal cell": "unknown",
+    "myometrial cell": "unknown",
+    "epithelial cell of uterus": "epithelial_cell",
+    "prickle cell": "epithelial_cell",
+    "Merkel cell": "epithelial_cell",
+    "cortical thymic epithelial cell": "epithelial_cell",
+    "medullary thymic epithelial cell": "epithelial_cell",
+    "epicardial adipocyte": "connective_cell",
+    "peritubular capillary endothelial cell": "epithelial_cell",
+    "conjunctival epithelial cell": "epithelial_cell",
+    "glomerular capillary endothelial cell": "epithelial_cell",
+    "columnar/cuboidal epithelial cell": "epithelial_cell",
+    "kidney resident macrophage": "immune_cell",
+    "ON-blue cone bipolar cell": "neural_cell",
+    "CD8-alpha alpha positive, gamma-delta intraepithelial T cell": "hematopoietic_cell",
+    "NKp46-positive innate lymphoid cell, human": "hematopoietic_cell",
+    "neutrophil progenitor cell": "precursor_cell",
+    "skeletal muscle satellite stem cell": "precursor_cell",
+    "mucosal type mast cell": "hematopoietic_cell",
+    "metallothionein-positive alveolar macrophage": "immune_cell",
+    "cerebral cortex neuron": "neural_cell",
+    "basal cell of epithelium of trachea": "epithelial_cell",
+    "tracheal goblet cell": "epithelial_cell",
+    "photoreceptor cell": "neural_cell",
+    "cochlea auditory hair cell": "neural_cell",
+    "pinealocyte": "epithelial_cell",
+    "iris pigment epithelial cell": "epithelial_cell",
+    "radial glial cell": "neural_cell",
+    "GABAergic interneuron": "neural_cell",
+    "pancreatic endocrine cell": "secretory_cell",
+    "endothelial cell of sinusoid": "epithelial_cell",
+    "DN3 thymocyte": "hematopoietic_cell",
+    "DN1 thymic pro-T cell": "precursor_cell",
+    "parasol ganglion cell of retina": "neural_cell",
+    "epithelial cell of proximal tubule segment 3": "ciliated_cell",
+    "valve interstitial cell": "epithelial_cell",
+    "valve endothelial cell": "epithelial_cell",
+    "myocyte of sinoatrial node": "contractile_cell",
+    "colon goblet cell": "epithelial_cell",
+    "enteroendocrine cell of colon": "secretory_cell",
+    "paneth cell of colon": "secretory_cell",
+    "cholinergic neuron": "neural_cell",
+    "L4/5 intratelencephalic projecting glutamatergic neuron": "neural_cell",
+    "L6 intratelencephalic projecting glutamatergic neuron": "neural_cell",
+    "L3 intratelencephalic projecting glutamatergic neuron": "neural_cell",
+    "tanycyte": "neural_cell",
+    "IgG-negative class switched memory B cell": "hematopoietic_cell",
+    "IgG memory B cell": "hematopoietic_cell",
+    "indirect pathway medium spiny neuron": "neural_cell",
+    "direct pathway medium spiny neuron": "neural_cell",
+    "elicited macrophage": "immune_cell",
+    "alveolar type 1 fibroblast cell": "connective_cell",
+    "respiratory hillock cell": "epithelial_cell",
+    "epithelial cell of lower respiratory tract": "epithelial_cell",
+    "serous secreting cell": "secretory_cell",
+    "tracheobronchial serous cell": "secretory_cell",
+    "tracheobronchial goblet cell": "secretory_cell",
+    "bronchial goblet cell": "secretory_cell",
+    "epithelial fate stem cell": "epithelial_cell",
+    "lymphatic endothelial cell of medulla ceiling": "epithelial_cell",
+    "lymphatic endothelial cell of subcapsular sinus floor": "epithelial_cell",
+    "lymphatic endothelial cell of subcapsular sinus ceiling": "epithelial_cell",
+    "lymph node lymphatic vessel endothelial cell": "epithelial_cell",
+    "tissue-resident macrophage": "immune_cell",
+    "glandular epithelial cell": "epithelial_cell",
+    "L4 intratelencephalic projecting glutamatergic neuron": "neural_cell",
+    "L5/6 near-projecting glutamatergic neuron": "neural_cell",
+    "forebrain radial glial cell": "neural_cell",
+    "white adipocyte": "connective_cell",
+    "precursor cell": "precursor_cell",
+    "primary cultured cell": "unknown",
+    "liver dendritic cell": "immune_cell",
+    "giant bipolar cell": "neural_cell",
+    "eurydendroid cell": "neural_cell",
+    "type A enteroendocrine cell": "secretory_cell",
+    "type D enteroendocrine cell": "secretory_cell",
+    "serous cell of epithelium of trachea": "secretory_cell",
+    "T follicular regulatory cell": "hematopoietic_cell",
+    "enterocyte of epithelium of small intestine": "ciliated_cell",
+    "tuft cell of colon": "epithelial_cell",
+    "small intestine goblet cell": "epithelial_cell",
+    "epithelial cell of small intestine": "epithelial_cell",
+    "BEST4+ intestinal epithelial cell, human": "epithelial_cell",
+    "microfold cell of epithelium of small intestine": "immune_cell",
+    "foveolar cell of stomach": "epithelial_cell",
+    "mucous neck cell": "epithelial_cell",
+    "type G enteroendocrine cell": "secretory_cell",
+    "natural T-regulatory cell": "immune_cell",
+    "peptic cell": "epithelial_cell",
+    "P/D1 enteroendocrine cell": "secretory_cell",
+    "parietal cell": "epithelial_cell",
+    "eye photoreceptor cell": "neural_cell",
+    "keratocyte": "connective_cell",
+    "preosteoblast": "unknown",
+    "endosteal cell": "unknown",
+    "immature natural killer cell": "hematopoietic_cell",
+    "basal cell of epithelium of bronchus": "epithelial_cell",
+    "brush cell of bronchus": "epithelial_cell",
+    "sensory neuron of dorsal root ganglion": "neural_cell",
+    "parasympathetic neuron": "neural_cell",
+    "immature T cell": "hematopoietic_cell",
+    "epithelial cell of esophagus": "epithelial_cell",
+    "glandular cell of esophagus": "secretory_cell",
+    "perineuronal satellite cell": "neural_cell",
+    "olfactory ensheathing cell": "neural_cell",
+    "onychocyte": "embryonic_cell",
+    "epidermal Langerhans cell": "immune_cell",
+    "brush cell of trachea": "epithelial_cell",
+    "mesothelial cell of pleura": "epithelial_cell",
+    "subcutaneous adipocyte": "connective_cell",
+    "hepatoblast": "embryonic_cell",
+    "stromal cell of endometrium": "connective_cell",
+    "central nervous system neuron": "neural_cell",
+    "intraepithelial lymphocyte": "hematopoietic_cell",
+    "amygdala excitatory neuron": "neural_cell",
+    "bistratified retinal ganglion cell": "neural_cell",
+    "chromaffin cell": "embryonic_cell",
+    "chorionic trophoblast cell": "embryonic_cell",
+    "B-1a B cell": "hematopoietic_cell",
+    "ganglion interneuron": "neural_cell",
+    "B-1b B cell": "hematopoietic_cell",
+    "tongue muscle cell": "contractile_cell",
+    "cortical cell of adrenal gland": "embryonic_cell",
+    "histaminergic neuron": "neural_cell",
+    "epithelial cell of exocrine pancreas": "epithelial_cell",
+    "cerebellar Golgi cell": "neural_cell",
+    "kidney inner medulla collecting duct epithelial cell": "epithelial_cell",
+    "kidney pelvis urothelial cell": "epithelial_cell",
+    "atrioventricular bundle cell": "contractile_cell",
+    "peripheral blood mononuclear cell": "hematopoietic_cell",
+    "type II NK T cell": "hematopoietic_cell",
+    "immature alpha-beta T cell": "hematopoietic_cell",
+    "bipolar neuron": "neural_cell",
+    "brainstem motor neuron": "neural_cell",
+    "epithelial cell of lacrimal sac": "epithelial_cell",
+    "skeletal muscle fibroblast": "skeletal_muscle",
+    "salivary gland cell": "secretory_cell",
+    "astrocyte of the cerebellum": "neural_cell",
+    "CD4-positive, alpha-beta memory T cell, CD45RO-positive": "hematopoietic_cell",
+    "GIP cell": "epithelial_cell",
+    "decidual cell": "connective_cell",
+    "migratory enteric neural crest cell": "neural_cell",
+    "dentate gyrus neuron": "neural_cell",
+    "taste receptor cell": "epithelial_cell",
+    "dermis microvascular lymphatic vessel endothelial cell": "epithelial_cell",
+    "activated type II NK T cell": "hematopoietic_cell",
+    "bone marrow cell": "skeletal_muscle",
+    "CNS interneuron": "neural_cell",
+    "type I enteroendocrine cell": "secretory_cell",
+    "hair follicle melanocyte": "secretory_cell",
+    "kidney afferent arteriole endothelial cell": "epithelial_cell",
+    "multinucleated giant cell": "unknown",
+    "conjunctiva goblet cell": "epithelial_cell",
+    "thyroid follicular cell": "epithelial_cell",
+    "embryonic stem cell": "embryonic_cell",
+    "respiratory epithelial cell": "epithelial_cell",
+    "bronchial epithelial cell": "epithelial_cell",
+    "endothelial stalk cell": "epithelial_cell",
+    "enucleated reticulocyte": "hematopoietic_cell",
+    "kidney efferent arteriole endothelial cell": "epithelial_cell",
+    "hippocampal CA1-3 neuron": "neural_cell",
+    "intratelencephalic-projecting glutamatergic cortical neuron": "neural_cell",
+    "gingival epithelial cell": "epithelial_cell",
+    "visceromotor neuron": "neural_cell",
+    "sebaceous gland cell": "epithelial_cell",
+    "activated CD8-positive, alpha-beta T cell, human": "hematopoietic_cell",
+    "stromal cell of lamina propria of small intestine": "connective_cell",
+    "pre-B-I cell": "precursor_cell",
+    "immature Schwann cell": "precursor_cell",
+    "CD8-positive, alpha-beta cytokine secreting effector T cell": "hematopoietic_cell",
+    "epithelial cell of sweat gland": "epithelial_cell",
+    "ventricular cardiac muscle cell": "contractile_cell"
+}

teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json ADDED Viewed

	@@ -0,0 +1,125 @@

+{
+    "Alzheimer disease": "brain_disease",
+    "B-cell acute lymphoblastic leukemia": "cancer_disease",
+    "Barrett esophagus": "digestive_disease",
+    "COVID-19": "infectious_disease",
+    "Crohn disease": "immune_disease",
+    "Crohn ileitis": "immune_disease",
+    "Lewy body dementia": "brain_disease",
+    "Parkinson disease": "brain_disease",
+    "Plasmodium malariae malaria": "infectious_disease",
+    "Wilms tumor": "cancer_disease",
+    "acute kidney failure": "kidney_disease",
+    "acute myeloid leukemia": "cancer_disease",
+    "acute myocardial infarction": "cardiovascular_disease",
+    "acute promyelocytic leukemia": "cancer_disease",
+    "adenocarcinoma": "cancer_disease",
+    "age related macular degeneration 7": "other_disease",
+    "amyotrophic lateral sclerosis": "brain_disease",
+    "amyotrophic lateral sclerosis 26 with or without frontotemporal dementia": "brain_disease",
+    "anencephaly": "genetic_disease",
+    "arrhythmogenic right ventricular cardiomyopathy": "cardiovascular_disease",
+    "aspiration pneumonia": "infectious_disease",
+    "autosomal dominant polycystic kidney disease": "genetic_disease",
+    "basal cell carcinoma": "cancer_disease",
+    "basal laminar drusen": "other_disease",
+    "benign prostatic hyperplasia": "other_disease",
+    "blastoma": "cancer_disease",
+    "brain neoplasm": "cancer_disease",
+    "breast cancer": "cancer_disease",
+    "breast carcinoma": "cancer_disease",
+    "cardiomyopathy": "cardiovascular_disease",
+    "cataract": "other_disease",
+    "chromophobe renal cell carcinoma": "cancer_disease",
+    "chronic kidney disease": "kidney_disease",
+    "chronic obstructive pulmonary disease": "immune_disease",
+    "chronic rhinitis": "immune_disease",
+    "clear cell renal carcinoma": "cancer_disease",
+    "colon sessile serrated adenoma/polyp": "cancer_disease",
+    "colorectal cancer": "cancer_disease",
+    "common variable immunodeficiency": "immune_disease",
+    "congenital heart disease": "cardiovascular_disease",
+    "cystic fibrosis": "immune_disease",
+    "dementia": "brain_disease",
+    "diabetic kidney disease": "immune_disease",
+    "digestive system disorder": "digestive_disease",
+    "dilated cardiomyopathy": "cardiovascular_disease",
+    "endocrine pancreas disorder": "other_disease",
+    "epidermolysis bullosa": "other_disease",
+    "epilepsy": "brain_disease",
+    "frontotemporal dementia": "brain_disease",
+    "gastric intestinal metaplasia": "cancer_disease",
+    "gastritis": "digestive_disease",
+    "gingivitis": "other_disease",
+    "glioblastoma": "cancer_disease",
+    "heart disorder": "cardiovascular_disease",
+    "heart failure": "cardiovascular_disease",
+    "hydrocephalus": "brain_disease",
+    "hydrosalpinx": "other_disease",
+    "hyperplastic polyp": "cancer_disease",
+    "hypersensitivity pneumonitis": "immune_disease",
+    "influenza": "infectious_disease",
+    "injury": "other_disease",
+    "interstitial lung disease": "respiratory_disease",
+    "juvenile dermatomyositis": "immune_disease",
+    "keloid": "other_disease",
+    "kidney benign neoplasm": "cancer_disease",
+    "kidney oncocytoma": "cancer_disease",
+    "listeriosis": "infectious_disease",
+    "localized scleroderma": "immune_disease",
+    "long COVID-19": "infectious_disease",
+    "luminal A breast carcinoma": "cancer_disease",
+    "luminal B breast carcinoma": "cancer_disease",
+    "lung adenocarcinoma": "cancer_disease",
+    "lung large cell carcinoma": "cancer_disease",
+    "lymphadenitis": "infectious_disease",
+    "lymphangioleiomyomatosis": "respiratory_disease",
+    "macular degeneration": "other_disease",
+    "malignant ovarian serous tumor": "cancer_disease",
+    "malignant pancreatic neoplasm": "cancer_disease",
+    "metastatic melanoma": "cancer_disease",
+    "multiple sclerosis": "brain_disease",
+    "myocardial infarction": "cardiovascular_disease",
+    "neuroendocrine carcinoma": "cancer_disease",
+    "non-compaction cardiomyopathy": "cardiovascular_disease",
+    "non-small cell lung carcinoma": "cancer_disease",
+    "non-specific interstitial pneumonia": "immune_disease",
+    "nonpapillary renal cell carcinoma": "cancer_disease",
+    "normal": "healthy",
+    "opiate dependence": "other_disease",
+    "periodontitis": "other_disease",
+    "pilocytic astrocytoma": "cancer_disease",
+    "plasma cell myeloma": "cancer_disease",
+    "pleomorphic carcinoma": "cancer_disease",
+    "premalignant hematological system disease": "cancer_disease",
+    "primary biliary cholangitis": "immune_disease",
+    "primary sclerosing cholangitis": "immune_disease",
+    "pulmonary emphysema": "respiratory_disease",
+    "pulmonary fibrosis": "immune_disease",
+    "pulmonary sarcoidosis": "immune_disease",
+    "renal cell carcinoma": "cancer_disease",
+    "respiratory failure": "respiratory_disease",
+    "respiratory system disorder": "respiratory_disease",
+    "severe acute respiratory syndrome": "infectious_disease",
+    "small cell lung carcinoma": "cancer_disease",
+    "squamous cell lung carcinoma": "cancer_disease",
+    "systemic lupus erythematosus": "immune_disease",
+    "temporal lobe epilepsy": "brain_disease",
+    "tongue cancer": "cancer_disease",
+    "toxoplasmosis": "infectious_disease",
+    "triple-negative breast carcinoma": "cancer_disease",
+    "trisomy 18": "genetic_disease",
+    "tubular adenoma": "cancer_disease",
+    "tubulovillous adenoma": "cancer_disease",
+    "type 1 diabetes mellitus": "immune_disease",
+    "type 2 diabetes mellitus": "immune_disease",
+    "B-cell non-Hodgkin lymphoma": "cancer_disease",
+    "colorectal neoplasm": "cancer_disease",
+    "follicular lymphoma": "cancer_disease",
+    "Down syndrome": "genetic_disease",
+    "gastric cancer": "cancer_disease",
+    "post-COVID-19 disorder": "infectious_disease",
+    "encephalomyelitis": "brain_disease",
+    "pneumonia": "infectious_disease",
+    "rheumatoid arthritis": "immune_disease"
+}

teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "female": "female",
+    "male": "male",
+    "unknown": "unknown"
+}

teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json ADDED Viewed

	@@ -0,0 +1,415 @@

+{
+    "Brodmann (1909) area 19": "central_nervous_tissue",
+    "Brodmann (1909) area 23": "central_nervous_tissue",
+    "Brodmann (1909) area 25": "central_nervous_tissue",
+    "Brodmann (1909) area 4": "central_nervous_tissue",
+    "Brodmann (1909) area 46": "central_nervous_tissue",
+    "abdomen": "musculature_tissue",
+    "abdominal wall": "musculature_tissue",
+    "adipose tissue": "adipose_tissue",
+    "adnexa of uterus": "reproductive_tissue",
+    "adrenal gland": "endocrine_tissue",
+    "adrenal tissue": "endocrine_tissue",
+    "agranular insular cortex": "central_nervous_tissue",
+    "alveolus of lung": "respiratory_tissue",
+    "ampulla of uterine tube": "reproductive_tissue",
+    "amygdala": "central_nervous_tissue",
+    "angular gyrus": "central_nervous_tissue",
+    "anterior cingulate cortex": "central_nervous_tissue",
+    "anterior cingulate gyrus": "central_nervous_tissue",
+    "anterior hypothalamic region": "central_nervous_tissue",
+    "anterior part of tongue": "sensory_tissue",
+    "anterior wall of left ventricle": "cardiovascular_tissue",
+    "anterolateral visual area": "central_nervous_tissue",
+    "aorta": "cardiovascular_tissue",
+    "apex of heart": "cardiovascular_tissue",
+    "artery": "cardiovascular_tissue",
+    "ascending colon": "digestive_tissue",
+    "ascitic fluid": "unknown",
+    "atrioventricular node": "cardiovascular_tissue",
+    "auditory cortex": "central_nervous_tissue",
+    "autopod skin": "integumentary_tissue",
+    "axilla": "integumentary_tissue",
+    "barrel cortex": "central_nervous_tissue",
+    "basal forebrain": "central_nervous_tissue",
+    "basal ganglion": "central_nervous_tissue",
+    "basal zone of heart": "cardiovascular_tissue",
+    "bladder lumen": "renal_tissue",
+    "bladder organ": "renal_tissue",
+    "blood": "hematopoietic_tissue",
+    "body of stomach": "digestive_tissue",
+    "bone marrow": "hematopoietic_tissue",
+    "bone spine": "hematopoietic_tissue",
+    "brain": "central_nervous_tissue",
+    "brain gray matter": "central_nervous_tissue",
+    "brain meninx": "central_nervous_tissue",
+    "brain ventricle": "central_nervous_tissue",
+    "brain white matter": "central_nervous_tissue",
+    "brainstem": "central_nervous_tissue",
+    "breast": "exocrine_tissue",
+    "bronchial epithelial cell": "respiratory_tissue",
+    "bronchopulmonary lymph node": "immune_tissue",
+    "bronchus": "respiratory_tissue",
+    "brown adipose tissue": "immune_tissue",
+    "brown preadipocyte": "adipose_tissue",
+    "caecum": "digestive_tissue",
+    "caecum epithelium": "digestive_tissue",
+    "cardia of stomach": "digestive_tissue",
+    "cardiac atrium": "cardiovascular_tissue",
+    "cardiac ventricle": "cardiovascular_tissue",
+    "caudal ganglionic eminence": "central_nervous_tissue",
+    "caudate lobe of liver": "hepatic_tissue",
+    "caudate nucleus": "central_nervous_tissue",
+    "cerebellar cortex": "central_nervous_tissue",
+    "cerebellar vermis": "central_nervous_tissue",
+    "cerebellum": "central_nervous_tissue",
+    "cerebellum lobule": "central_nervous_tissue",
+    "cerebellum vermis lobule": "central_nervous_tissue",
+    "cerebral cortex": "central_nervous_tissue",
+    "cerebral nuclei": "central_nervous_tissue",
+    "cerebrocerebellum": "central_nervous_tissue",
+    "cervical lymph node": "immune_tissue",
+    "cervical spinal cord white matter": "central_nervous_tissue",
+    "chorionic villus": "reproductive_tissue",
+    "chorioretinal region": "eye_tissue",
+    "choroid plexus": "central_nervous_tissue",
+    "ciliary body": "eye_tissue",
+    "cingulate cortex": "central_nervous_tissue",
+    "claustrum of brain": "central_nervous_tissue",
+    "colon": "digestive_tissue",
+    "colonic epithelium": "digestive_tissue",
+    "conjunctiva": "eye_tissue",
+    "cornea": "eye_tissue",
+    "corneo-scleral junction": "eye_tissue",
+    "coronary artery": "cardiovascular_tissue",
+    "corpus callosum": "central_nervous_tissue",
+    "cortex of kidney": "renal_tissue",
+    "cortical layer II/III": "central_nervous_tissue",
+    "cortical layer V": "central_nervous_tissue",
+    "cortical layer VI": "central_nervous_tissue",
+    "cortical plate": "central_nervous_tissue",
+    "cortical subplate": "central_nervous_tissue",
+    "cultured cell": "unknown",
+    "decidua": "reproductive_tissue",
+    "decidua basalis": "embryonic_tissue",
+    "dentate nucleus": "central_nervous_tissue",
+    "dermis": "integumentary_tissue",
+    "descending colon": "digestive_tissue",
+    "diaphragm": "musculature_tissue",
+    "diencephalon": "central_nervous_tissue",
+    "dorsal thalamus": "central_nervous_tissue",
+    "dorsolateral prefrontal cortex": "central_nervous_tissue",
+    "duodeno-jejunal junction": "digestive_tissue",
+    "duodenum": "digestive_tissue",
+    "dura mater": "central_nervous_tissue",
+    "embryo": "embryonic_tissue",
+    "embryonic stem cell": "embryonic_tissue",
+    "endocrine pancreas": "endocrine_tissue",
+    "endometrium": "reproductive_tissue",
+    "endothelial cell": "cardiovascular_tissue",
+    "entorhinal cortex": "central_nervous_tissue",
+    "epididymal fat pad": "cardiovascular_tissue",
+    "epithelial cell of alveolus of lung": "respiratory_tissue",
+    "epithelial cell of lung": "respiratory_tissue",
+    "epithelium of esophagus": "digestive_tissue",
+    "epithelium of small intestine": "digestive_tissue",
+    "epithelium of trachea": "respiratory_tissue",
+    "esophagogastric junction": "digestive_tissue",
+    "esophagus": "digestive_tissue",
+    "esophagus muscularis mucosa": "digestive_tissue",
+    "exocrine pancreas": "exocrine_tissue",
+    "eye": "eye_tissue",
+    "eye trabecular meshwork": "eye_tissue",
+    "fallopian tube": "reproductive_tissue",
+    "fimbria of uterine tube": "reproductive_tissue",
+    "forebrain": "central_nervous_tissue",
+    "forelimb": "musculature_tissue",
+    "fovea centralis": "eye_tissue",
+    "frontal cortex": "central_nervous_tissue",
+    "frontal lobe": "central_nervous_tissue",
+    "gallbladder": "digestive_tissue",
+    "ganglionic eminence": "central_nervous_tissue",
+    "gastrocnemius": "musculature_tissue",
+    "gingiva": "exocrine_tissue",
+    "glabella skin": "integumentary_tissue",
+    "gonad": "reproductive_tissue",
+    "gonad primordium": "reproductive_tissue",
+    "gonadal fat pad": "reproductive_tissue",
+    "gustatory cortex": "central_nervous_tissue",
+    "gut wall": "digestive_tissue",
+    "head of caudate nucleus": "central_nervous_tissue",
+    "heart": "cardiovascular_tissue",
+    "heart left ventricle": "cardiovascular_tissue",
+    "heart right ventricle": "cardiovascular_tissue",
+    "hemisphere part of cerebellar posterior lobe": "central_nervous_tissue",
+    "hepatic cecum": "hepatic_tissue",
+    "hepatic flexure of colon": "digestive_tissue",
+    "hindbrain": "central_nervous_tissue",
+    "hindgut": "digestive_tissue",
+    "hindlimb": "musculature_tissue",
+    "hindlimb skin": "integumentary_tissue",
+    "hippocampal formation": "central_nervous_tissue",
+    "hypothalamus": "central_nervous_tissue",
+    "ileal epithelium": "digestive_tissue",
+    "ileum": "digestive_tissue",
+    "inferior parietal cortex": "central_nervous_tissue",
+    "inferior temporal gyrus": "central_nervous_tissue",
+    "inguinal fat pad": "adipose_tissue",
+    "inguinal lymph node": "immune_tissue",
+    "inguinal part of abdomen": "musculature_tissue",
+    "inguinal region skin": "integumentary_tissue",
+    "inner medulla of kidney": "adipose_tissue",
+    "insular cortex": "central_nervous_tissue",
+    "interventricular septum": "cardiovascular_tissue",
+    "intestine": "digestive_tissue",
+    "iris": "eye_tissue",
+    "islet of Langerhans": "endocrine_tissue",
+    "isthmus of fallopian tube": "reproductive_tissue",
+    "jejunal epithelium": "digestive_tissue",
+    "jejunum": "digestive_tissue",
+    "kidney": "renal_tissue",
+    "kidney blood vessel": "cardiovascular_tissue",
+    "lacrimal gland": "exocrine_tissue",
+    "lamina propria": "digestive_tissue",
+    "lamina propria of large intestine": "digestive_tissue",
+    "lamina propria of mucosa of colon": "digestive_tissue",
+    "lamina propria of small intestine": "digestive_tissue",
+    "large intestine": "digestive_tissue",
+    "lateral amygdaloid nucleus": "central_nervous_tissue",
+    "lateral entorhinal cortex": "central_nervous_tissue",
+    "lateral ganglionic eminence": "central_nervous_tissue",
+    "lateral geniculate body": "central_nervous_tissue",
+    "lateral nuclear group of thalamus": "central_nervous_tissue",
+    "lateral visual area": "central_nervous_tissue",
+    "left cardiac atrium": "cardiovascular_tissue",
+    "left colon": "digestive_tissue",
+    "left frontal lobe": "central_nervous_tissue",
+    "left lung": "respiratory_tissue",
+    "left ovary": "reproductive_tissue",
+    "left parietal lobe": "central_nervous_tissue",
+    "left temporal lobe": "central_nervous_tissue",
+    "lens of camera-type eye": "eye_tissue",
+    "limb muscle": "musculature_tissue",
+    "lingula of left lung": "respiratory_tissue",
+    "liver": "hepatic_tissue",
+    "lower esophagus": "digestive_tissue",
+    "lower leg skin": "integumentary_tissue",
+    "lower lobe of left lung": "respiratory_tissue",
+    "lower lobe of right lung": "respiratory_tissue",
+    "lung": "respiratory_tissue",
+    "lung parenchyma": "respiratory_tissue",
+    "lymph node": "immune_tissue",
+    "macula lutea": "eye_tissue",
+    "macula lutea proper": "eye_tissue",
+    "mammary gland": "exocrine_tissue",
+    "mammary gland epithelial cell": "exocrine_tissue",
+    "medial amygdaloid nucleus": "central_nervous_tissue",
+    "medial dorsal nucleus of thalamus": "central_nervous_tissue",
+    "medial entorhinal cortex": "central_nervous_tissue",
+    "medial ganglionic eminence": "central_nervous_tissue",
+    "medial orbital frontal cortex": "central_nervous_tissue",
+    "medulla oblongata": "central_nervous_tissue",
+    "meningeal dura mater": "central_nervous_tissue",
+    "mesenteric artery": "cardiovascular_tissue",
+    "mesenteric fat pad": "immune_tissue",
+    "mesenteric lymph node": "immune_tissue",
+    "mesoderm": "reproductive_tissue",
+    "midbrain": "central_nervous_tissue",
+    "middle lobe of right lung": "respiratory_tissue",
+    "middle temporal gyrus": "central_nervous_tissue",
+    "mucosa": "digestive_tissue",
+    "muscle of abdomen": "musculature_tissue",
+    "muscle of pelvic diaphragm": "musculature_tissue",
+    "muscle organ": "musculature_tissue",
+    "muscle tissue": "musculature_tissue",
+    "myelencephalon": "central_nervous_tissue",
+    "myometrium": "reproductive_tissue",
+    "nasal cavity": "sensory_tissue",
+    "nasopharynx": "respiratory_tissue",
+    "neocortex": "central_nervous_tissue",
+    "neural tube": "embryonic_tissue",
+    "nose": "sensory_tissue",
+    "nose skin": "integumentary_tissue",
+    "nucleus accumbens": "central_nervous_tissue",
+    "occipital cortex": "central_nervous_tissue",
+    "occipital lobe": "central_nervous_tissue",
+    "olfactory region": "sensory_tissue",
+    "omental fat pad": "adipose_tissue",
+    "omentum": "musculature_tissue",
+    "orbitofrontal cortex": "central_nervous_tissue",
+    "outer medulla of kidney": "adipose_tissue",
+    "ovary": "reproductive_tissue",
+    "pallidum": "central_nervous_tissue",
+    "pancreas": "endocrine_tissue",
+    "paracolic gutter": "musculature_tissue",
+    "parietal cortex": "central_nervous_tissue",
+    "parietal lobe": "central_nervous_tissue",
+    "parietal peritoneum": "musculature_tissue",
+    "parotid gland": "exocrine_tissue",
+    "perifoveal part of retina": "eye_tissue",
+    "periovarian fat pad": "reproductive_tissue",
+    "peripheral lymph node": "immune_tissue",
+    "peripheral region of retina": "eye_tissue",
+    "peripheral zone of prostate": "reproductive_tissue",
+    "perirenal fat": "adipose_tissue",
+    "perirhinal cortex": "central_nervous_tissue",
+    "peritoneum": "musculature_tissue",
+    "pia mater": "central_nervous_tissue",
+    "pigment epithelium of eye": "eye_tissue",
+    "placenta": "reproductive_tissue",
+    "pleura": "respiratory_tissue",
+    "pleural effusion": "respiratory_tissue",
+    "pons": "central_nervous_tissue",
+    "posterior hypothalamic region": "central_nervous_tissue",
+    "posterior parietal association areas": "central_nervous_tissue",
+    "posterior part of tongue": "sensory_tissue",
+    "preadipocyte": "musculature_tissue",
+    "prefrontal cortex": "central_nervous_tissue",
+    "primary auditory cortex": "central_nervous_tissue",
+    "primary motor cortex": "central_nervous_tissue",
+    "primary somatosensory cortex": "central_nervous_tissue",
+    "primary visual cortex": "central_nervous_tissue",
+    "prostate gland": "reproductive_tissue",
+    "pubis": "hematopoietic_tissue",
+    "putamen": "central_nervous_tissue",
+    "pyloric antrum": "digestive_tissue",
+    "rectum": "digestive_tissue",
+    "rectus abdominis muscle": "musculature_tissue",
+    "renal medulla": "renal_tissue",
+    "renal papilla": "renal_tissue",
+    "renal pelvis": "renal_tissue",
+    "respiratory airway": "respiratory_tissue",
+    "respiratory basal cell": "respiratory_tissue",
+    "retina": "eye_tissue",
+    "retinal neural layer": "eye_tissue",
+    "retrosplenial granular cortex": "central_nervous_tissue",
+    "retrosplenial region": "central_nervous_tissue",
+    "rib": "hematopoietic_tissue",
+    "right cardiac atrium": "cardiovascular_tissue",
+    "right colon": "digestive_tissue",
+    "right frontal lobe": "central_nervous_tissue",
+    "right lung": "respiratory_tissue",
+    "right occipital lobe": "central_nervous_tissue",
+    "right ovary": "reproductive_tissue",
+    "right parietal lobe": "central_nervous_tissue",
+    "right temporal lobe": "central_nervous_tissue",
+    "saliva": "exocrine_tissue",
+    "scalp": "integumentary_tissue",
+    "sclera": "eye_tissue",
+    "secondary somatosensory cortex": "central_nervous_tissue",
+    "secondary visual cortex": "central_nervous_tissue",
+    "sigmoid colon": "digestive_tissue",
+    "sinoatrial node": "cardiovascular_tissue",
+    "skin epidermis": "integumentary_tissue",
+    "skin of abdomen": "integumentary_tissue",
+    "skin of back": "integumentary_tissue",
+    "skin of body": "integumentary_tissue",
+    "skin of breast": "integumentary_tissue",
+    "skin of cheek": "integumentary_tissue",
+    "skin of chest": "integumentary_tissue",
+    "skin of external ear": "integumentary_tissue",
+    "skin of face": "integumentary_tissue",
+    "skin of forearm": "integumentary_tissue",
+    "skin of forehead": "integumentary_tissue",
+    "skin of hip": "integumentary_tissue",
+    "skin of leg": "integumentary_tissue",
+    "skin of pes": "integumentary_tissue",
+    "skin of prepuce of penis": "integumentary_tissue",
+    "skin of scalp": "integumentary_tissue",
+    "skin of shoulder": "integumentary_tissue",
+    "skin of temple": "integumentary_tissue",
+    "skin of trunk": "integumentary_tissue",
+    "small intestine": "digestive_tissue",
+    "spinal cord": "central_nervous_tissue",
+    "spleen": "immune_tissue",
+    "stomach": "digestive_tissue",
+    "striatum": "central_nervous_tissue",
+    "subcutaneous abdominal adipose tissue": "adipose_tissue",
+    "subcutaneous adipose tissue": "adipose_tissue",
+    "subdural space": "central_nervous_tissue",
+    "subicular complex": "central_nervous_tissue",
+    "sublingual gland": "exocrine_tissue",
+    "submucosa of ascending colon": "digestive_tissue",
+    "submucosa of ileum": "digestive_tissue",
+    "submucosal esophageal gland": "digestive_tissue",
+    "substantia nigra pars compacta": "central_nervous_tissue",
+    "superior frontal gyrus": "central_nervous_tissue",
+    "superior parietal cortex": "central_nervous_tissue",
+    "superior temporal sulcus": "central_nervous_tissue",
+    "telencephalon": "central_nervous_tissue",
+    "temporal cortex": "central_nervous_tissue",
+    "temporal lobe": "central_nervous_tissue",
+    "temporoparietal junction": "central_nervous_tissue",
+    "tendon of semitendinosus": "musculature_tissue",
+    "testis": "reproductive_tissue",
+    "thalamic complex": "central_nervous_tissue",
+    "thoracic lymph node": "immune_tissue",
+    "thymus": "immune_tissue",
+    "thyroid gland": "endocrine_tissue",
+    "tongue": "sensory_tissue",
+    "tonsil": "immune_tissue",
+    "trachea": "respiratory_tissue",
+    "tracheal epithelial cell": "respiratory_tissue",
+    "transition zone of prostate": "reproductive_tissue",
+    "transverse colon": "digestive_tissue",
+    "trophoblast": "embryonic_tissue",
+    "trophoblast cell": "embryonic_tissue",
+    "umbilical cord blood": "hematopoietic_tissue",
+    "upper leg skin": "integumentary_tissue",
+    "upper lobe of left lung": "respiratory_tissue",
+    "upper lobe of right lung": "respiratory_tissue",
+    "upper outer quadrant of breast": "exocrine_tissue",
+    "ureter": "renal_tissue",
+    "urethra": "renal_tissue",
+    "urinary bladder": "renal_tissue",
+    "uterine cervix": "reproductive_tissue",
+    "uterus": "reproductive_tissue",
+    "vasculature": "cardiovascular_tissue",
+    "vault of skull": "hematopoietic_tissue",
+    "vein": "cardiovascular_tissue",
+    "venous blood": "cardiovascular_tissue",
+    "ventral lateral nucleus of thalamus": "cardiovascular_tissue",
+    "ventral thalamus": "cardiovascular_tissue",
+    "ventricular system of brain": "central_nervous_tissue",
+    "vermiform appendix": "digestive_tissue",
+    "visceral abdominal adipose tissue": "adipose_tissue",
+    "visual cortex": "central_nervous_tissue",
+    "white matter": "central_nervous_tissue",
+    "white matter of cerebellum": "central_nervous_tissue",
+    "yolk sac": "embryonic_tissue",
+    "zone of skin": "integumentary_tissue",
+    "basolateral amygdaloid nuclear complex": "central_nervous_tissue",
+    "optic cup": "eye_tissue",
+    "pontine nuclear group": "central_nervous_tissue",
+    "arm skin": "integumentary_tissue",
+    "central amygdaloid nucleus": "central_nervous_tissue",
+    "caudate-putamen": "central_nervous_tissue",
+    "insula": "central_nervous_tissue",
+    "pulvinar nucleus": "central_nervous_tissue",
+    "cuneus cortex": "central_nervous_tissue",
+    "granular insular cortex": "central_nervous_tissue",
+    "hippocampal field": "central_nervous_tissue",
+    "T cell": "hematopoietic_tissue",
+    "dentate gyrus of hippocampal formation": "central_nervous_tissue",
+    "central nucleus of inferior colliculus": "central_nervous_tissue",
+    "olfactory cortex": "central_nervous_tissue",
+    "skeletal muscle tissue": "musculature_tissue",
+    "body of caudate nucleus": "central_nervous_tissue",
+    "substantia innominata": "central_nervous_tissue",
+    "corticomedial nuclear complex": "central_nervous_tissue",
+    "globus pallidus": "central_nervous_tissue",
+    "renal glomerulus": "renal_tissue",
+    "anterior cerebral artery": "cardiovascular_tissue",
+    "lateral septal complex": "central_nervous_tissue",
+    "coronal suture": "cardiovascular_tissue",
+    "bed nucleus of stria terminalis": "central_nervous_tissue",
+    "subiculum": "central_nervous_tissue",
+    "piriform cortex": "central_nervous_tissue",
+    "mesonephros": "renal_tissue",
+    "posterior parahippocampal gyrus": "central_nervous_tissue",
+    "cerebellar hemisphere": "central_nervous_tissue",
+    "Brodmann (1909) area 24": "central_nervous_tissue",
+    "septal nuclear complex": "central_nervous_tissue",
+    "anterior olfactory nucleus": "sensory_tissue",
+    "Brodmann (1909) area 38": "central_nervous_tissue"
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_cell_probs.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "ciliated_cell": 0.9,
+    "connective_cell": 0.514,
+    "contractile_cell": 0.9,
+    "embryonic_cell": 0.386,
+    "epithelial_cell": 0.332,
+    "hematopoietic_cell": 0.358,
+    "immune_cell": 0.708,
+    "neural_cell": 0.073,
+    "perivascular_cell": 0.9,
+    "precursor_cell": 0.902,
+    "secretory_cell": 0.9,
+    "skeletal_muscle": 0.9,
+    "unknown": 0
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_disease_probs.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "brain_disease": 0.952,
+    "cancer_disease": 0.531,
+    "cardiovascular_disease": 0.95,
+    "digestive_disease": 0.95,
+    "genetic_disease": 0.95,
+    "immune_disease": 0.95,
+    "infectious_disease": 0.95,
+    "kidney_disease": 0.95,
+    "other_disease": 0.95,
+    "respiratory_disease": 0.95,
+    "healthy": 0.112
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_sex_probs.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "female":0.238,
+    "male":0.316,
+    "unknown":0
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/all_filtered_tissue_probs.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+    "adipose_tissue": 0.9,
+    "cardiovascular_tissue": 0.853,
+    "central_nervous_tissue": 0.067,
+    "digestive_tissue": 0.9,
+    "embryonic_tissue": 0.106,
+    "endocrine_tissue": 0.9,
+    "exocrine_tissue": 0.508,
+    "eye_tissue": 0.634,
+    "hematopoietic_tissue": 0.384,
+    "hepatic_tissue": 0.9,
+    "immune_tissue": 0.9,
+    "integumentary_tissue": 0.9,
+    "musculature_tissue": 0.822,
+    "renal_tissue": 0.833,
+    "reproductive_tissue": 0.9,
+    "respiratory_tissue": 0.493,
+    "sensory_tissue": 0.9,
+    "unknown": 0
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/cell_probs_for_classification.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "ciliated_cell": 1,
+    "connective_cell": 1,
+    "contractile_cell": 1,
+    "embryonic_cell": 1,
+    "epithelial_cell": 1,
+    "hematopoietic_cell": 1,
+    "immune_cell": 1,
+    "neural_cell": 1,
+    "perivascular_cell": 1,
+    "precursor_cell": 1,
+    "secretory_cell": 1,
+    "skeletal_muscle": 1,
+    "unknown": 0
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/disease_probs_for_classification.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "brain_disease": 1,
+    "cancer_disease": 1,
+    "cardiovascular_disease": 1,
+    "digestive_disease": 1,
+    "genetic_disease": 1,
+    "immune_disease": 1,
+    "infectious_disease": 1,
+    "kidney_disease": 1,
+    "other_disease": 1,
+    "respiratory_disease": 1,
+    "healthy": 1
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/sex_probs_for_classification.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "female": 1,
+    "male": 1,
+    "unknown":0
+}

teddy/data_processing/utils/bio_annotations/data/sampling_probs_for_collator/tissue_probs_for_classification.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+    "adipose_tissue": 1,
+    "cardiovascular_tissue": 1,
+    "central_nervous_tissue": 1,
+    "digestive_tissue": 1,
+    "embryonic_tissue": 1,
+    "endocrine_tissue": 1,
+    "exocrine_tissue": 1,
+    "eye_tissue": 1,
+    "hematopoietic_tissue": 1,
+    "hepatic_tissue": 1,
+    "immune_tissue": 1,
+    "integumentary_tissue": 1,
+    "musculature_tissue": 1,
+    "renal_tissue": 1,
+    "reproductive_tissue": 1,
+    "respiratory_tissue": 1,
+    "sensory_tissue": 1,
+    "unknown": 0
+}

teddy/data_processing/utils/gene_mapping/__init__.py ADDED Viewed

File without changes

teddy/data_processing/utils/gene_mapping/data/2407_ensembl_processed.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

teddy/data_processing/utils/gene_mapping/data/2407_hgnc_mapping.any2any.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:014adbde393ed0655d41fc7bf841946f39c3dab0153515908624d84730130c37
+size 12584454

teddy/data_processing/utils/gene_mapping/data/2407_mouse_gene_mapping.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

teddy/data_processing/utils/gene_mapping/data/human_mapping.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e56e9aa46a7bddca6b769af5548e0553c5b14c3c8c0b534122a44f52bc960b82
+size 22122103

teddy/data_processing/utils/gene_mapping/data/mouse_to_human_orthologs.one2one.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

teddy/data_processing/utils/gene_mapping/gene_mapper.py ADDED Viewed

	@@ -0,0 +1,629 @@

+"""
+Module: gene_mapper.py
+This module provides utilities for mapping gene identifiers between human and mouse datasets,
+as well as handling orthology relationships. It is designed to process gene expression data
+and map gene IDs to standardized formats for downstream analysis.
+Main Features:
+- Map human and mouse gene IDs to a common reference format.
+- Handle orthology relationships to convert mouse gene symbols to human gene symbols.
+- Combine mapping results from multiple sources and flag discrepancies.
+- Transform wide-format gene data into long-format for easier processing.
+- Categorize gene mappings based on their relationships (e.g., one-to-one, one-to-many).
+Dependencies:
+- pandas: For data manipulation.
+- numpy: For numerical operations.
+- warnings: For handling warnings during processing.
+Usage:
+- Import the functions and use them to map gene IDs or process gene data.
+- Run the script directly to execute test cases for the implemented functions.
+Why:
+- This module is essential for harmonizing gene identifiers across datasets, enabling
+  consistent analysis of gene expression data from different species or sources.
+"""
+import warnings
+import numpy as np
+import pandas as pd
+# import re
+def map_mouse_human(data_frame, query_column, human_map_db, mouse_map_db, orthology_db, verbose=False):
+    """
+    Maps gene IDs from a dataset to human and mouse reference databases, and resolves orthology relationships.
+    Args:
+        data_frame (pd.DataFrame): Input data containing gene IDs to map.
+        query_column (str): Column name in the input data containing gene IDs.
+        human_map_db (pd.DataFrame): Reference database for human gene mapping.
+        mouse_map_db (pd.DataFrame): Reference database for mouse gene mapping.
+        orthology_db (pd.DataFrame): Database containing orthology relationships between mouse and human genes.
+        verbose (bool): Whether to print detailed logs during processing.
+    Returns:
+        pd.DataFrame: A combined mapping result with discrepancies flagged.
+    """
+    if verbose:
+        print("------------    map human gene ids    ------------")
+    mapped_hsap = map_genes(
+        expr_mat=data_frame,
+        expr_ids=query_column,
+        annot_mat=human_map_db,
+        annot_from="id",
+        annot_to="reference_id",
+        return_unmapped=True,
+        keep_prev_ids=True,
+        verbose=verbose,
+    )
+    if verbose:
+        print("------------    map mouse gene ids    ------------")
+    mapped_mus = map_genes(
+        expr_mat=data_frame,
+        expr_ids=query_column,
+        annot_mat=mouse_map_db,
+        annot_from="id",
+        annot_to="reference_id",
+        return_unmapped=True,
+        keep_prev_ids=True,
+        verbose=verbose,
+    )
+    if verbose:
+        print("------------    mouse to human orthologs    ------------")
+    mouse_hsap = orthologs_to_human(
+        mouse_df=mapped_mus,
+        mouse_col="reference_id",
+        orthology_df=orthology_db,
+        ortho_mouse_col="mouse_gene_symbol",
+        ortho_human_col="human_gene_symbol",
+        ortho_type_col="mouse_homology_type",
+        orthology_type="ortholog_one2one",
+    )
+    mouse_hsap = mouse_hsap.loc[:, ["previous_ids", "human_gene_symbol"]].drop_duplicates()
+    mouse_hsap = mouse_hsap.rename(columns={"human_gene_symbol": "reference_id"})
+    if verbose:
+        print("------------    combine results    ------------")
+    both_mapped = combine_dataframe_columns(
+        df1=mapped_hsap, df2=mouse_hsap, id_column="previous_ids", reference_id_column="reference_id", verbose=verbose
+    )
+    both_mapped = both_mapped.loc[:, ["previous_ids", "reference_id", "discrepancy_flag"]].drop_duplicates()
+    return both_mapped
+def map_mouse_human2(data_frame, query_column, human_map_db, mouse_map_db, orthology_db, verbose=False):
+    if verbose:
+        print("------------    map human gene ids    ------------")
+    mapped_hsap = map_genes(
+        expr_mat=data_frame,
+        expr_ids=query_column,
+        annot_mat=human_map_db,
+        annot_from="id",
+        annot_to="reference_id",
+        return_unmapped=True,
+        keep_prev_ids=True,
+        verbose=verbose,
+    )
+    if verbose:
+        print("------------    map mouse gene ids    ------------")
+    mapped_mus = map_genes(
+        expr_mat=data_frame,
+        expr_ids=query_column,
+        annot_mat=mouse_map_db,
+        annot_from="id",
+        annot_to="reference_id",
+        return_unmapped=True,
+        keep_prev_ids=True,
+        verbose=verbose,
+    )
+    if verbose:
+        print("------------    mouse to human orthologs    ------------")
+    mouse_hsap = orthologs_to_human(
+        mouse_df=mapped_mus,
+        mouse_col="reference_id",
+        orthology_df=orthology_db,
+        ortho_mouse_col="mouse_gene_symbol",
+        ortho_human_col="human_gene_symbol",
+        ortho_type_col="mouse_homology_type",
+        orthology_type="ortholog_one2one",
+    )
+    ## this testing confirms that the filtering step produces the same result as the script below that takes ENSMUSG to fill the NA from orthologs that are not one2one
+    ## however not filtering causes discrepancies when combinding the two data_processing frames. this step is reqiured to avoid that
+    ## filter on mouse gene symbol - if not mapped then the input was not a mouse gene (or not a mouse gene that can be mapped)
+    ## alternative is to filter on ENSMUSG - but this will only work if the input list is ensembl gene ids, other ids will not be matched
+    if verbose:
+        print(mouse_hsap.shape)
+    mouse_hsap_filt = mouse_hsap.loc[
+        (mouse_hsap.previous_ids.str.contains("ENSMUS")) | (~mouse_hsap.mouse_gene_symbol.isnull()), :
+    ]
+    # mouse_hsap_remainder=mouse_hsap.loc[~((mouse_hsap.previous_ids.str.contains('ENSMUS')) | (~mouse_hsap.mouse_gene_symbol.isnull())),:]
+    if verbose:
+        print(mouse_hsap_filt.shape)
+    # (mouse_hsap_remainder)
+    mouse_hsap = mouse_hsap_filt
+    ## convert all gene human gene symbols to NA if they are not one2one orthologs
+    mouse_hsap.loc[mouse_hsap["mouse_homology_type"] != "ortholog_one2one", "human_gene_symbol"] = pd.NA
+    if verbose:
+        print("\n=========\tcount missing\t=========")
+        print(sum(mouse_hsap.human_gene_symbol.isnull()))
+        # fill missing human gene symbols with ENSMUSG
+    mouse_hsap["human_gene_symbol"] = mouse_hsap["human_gene_symbol"].fillna(mouse_hsap["previous_ids"])
+    if verbose:
+        print(sum(mouse_hsap.human_gene_symbol.str.contains("ENSMUSG")))
+    if verbose:
+        print("\n=========\tdoes not contain ENSMUSG\t=========")
+        print(mouse_hsap["previous_ids"][~mouse_hsap["previous_ids"].str.contains("ENSMUSG")].shape)
+        print(mouse_hsap["human_gene_symbol"][~mouse_hsap["human_gene_symbol"].str.contains("ENSMUSG")].shape)
+        print("\n=========\tcount missing\t=========")
+        print(sum(mouse_hsap.human_gene_symbol.isnull()))
+    mouse_hsap = mouse_hsap.loc[:, ["previous_ids", "human_gene_symbol"]].drop_duplicates()
+    mouse_hsap = mouse_hsap.rename(columns={"human_gene_symbol": "reference_id"})
+    if verbose:
+        print("------------    combine results    ------------")
+    both_mapped = combine_dataframe_columns(
+        df1=mapped_hsap, df2=mouse_hsap, id_column="previous_ids", reference_id_column="reference_id", verbose=verbose
+    )
+    both_mapped = both_mapped.loc[:, ["previous_ids", "reference_id", "discrepancy_flag"]].drop_duplicates()
+    return both_mapped
+def combine_dataframe_columns(df1, df2, id_column, reference_id_column, verbose=True):
+    """
+    Combines two dataframes by merging on a common ID column and flags discrepancies in reference IDs.
+    Args:
+        df1 (pd.DataFrame): First dataframe to merge.
+        df2 (pd.DataFrame): Second dataframe to merge.
+        id_column (str): Column name to merge on.
+        reference_id_column (str): Column name containing reference IDs.
+        verbose (bool): Whether to print detailed logs during processing.
+    Returns:
+        pd.DataFrame: A merged dataframe with discrepancies flagged.
+    """
+    # Standardize missing values by replacing empty strings with NaN
+    df1[reference_id_column] = df1[reference_id_column].replace("", pd.NA)
+    df2[reference_id_column] = df2[reference_id_column].replace("", pd.NA)
+    if verbose:
+        # Calculate and print the number of missing values in the reference_id columns of each dataframe
+        missing_df1 = df1[reference_id_column].isna().sum()
+        missing_df2 = df2[reference_id_column].isna().sum()
+        print(f"Missing values in {reference_id_column} of df1: {missing_df1}")
+        print(f"Missing values in {reference_id_column} of df2: {missing_df2}")
+    # Merge the dataframes on the specified 'id' column
+    merged_df = pd.merge(df1, df2, on=id_column, how="outer", suffixes=("_df1", "_df2"))
+    # Flag discrepancies where both reference IDs are present but do not match
+    merged_df["discrepancy_flag"] = np.where(
+        (merged_df[f"{reference_id_column}_df1"].notna())
+        & (merged_df[f"{reference_id_column}_df2"].notna())
+        & (merged_df[f"{reference_id_column}_df1"] != merged_df[f"{reference_id_column}_df2"]),
+        True,
+        False,
+    )
+    # Use numpy.where to combine the 'reference_id' columns, preferring non-null values from df1
+    merged_df[reference_id_column] = np.where(
+        merged_df[f"{reference_id_column}_df1"].notna(),
+        merged_df[f"{reference_id_column}_df1"],
+        merged_df[f"{reference_id_column}_df2"],
+    )
+    # Replace NaN with empty strings in the final dataframe
+    final_df = merged_df[
+        [id_column, reference_id_column, f"{reference_id_column}_df1", f"{reference_id_column}_df2", "discrepancy_flag"]
+    ].fillna("")
+    if verbose:
+        # Calculate and print the number of missing values in the final result
+        missing_final = final_df[reference_id_column].isna().sum()
+        print(f"Missing values in final merged {reference_id_column}: {missing_final}")
+        # Print a warning if there are any discrepancies
+        if final_df["discrepancy_flag"].any():
+            print("Warning: There are discrepancies in the reference IDs between the two dataframes.")
+    return final_df
+def orthologs_to_human(
+    mouse_df,
+    orthology_df,
+    mouse_col,
+    ortho_mouse_col,
+    ortho_human_col,
+    ortho_type_col,
+    orthology_type="ortholog_one2one",
+):
+    """
+    Merges a mouse data_processing frame with an orthology data_processing frame to convert mouse gene symbols to human gene symbols.
+    Parameters:
+    - mouse_df: pd.DataFrame - The data_processing frame containing mouse gene symbols.
+    - orthology_df: pd.DataFrame - The data_processing frame containing orthology information.
+    - mouse_col: str - The column name in the mouse_df that contains mouse gene symbols.
+    - ortho_mouse_col: str - The column name in the orthology_df that contains mouse gene symbols.
+    - ortho_human_col: str - The column name in the orthology_df that contains human gene symbols.
+    - ortho_type_col: str - The column name in the orthology_df that contains the orthology type.
+    - orthology_type: str - The type of orthology to keep (default is 'ortholog_one2one').
+    Returns:
+    - merged_df: pd.DataFrame - The merged data_processing frame with human gene symbols included.
+    """
+    # Check if the specified orthology type exists in the orthology dataframe
+    unique_ortho_types = orthology_df[ortho_type_col].unique()
+    if orthology_type not in unique_ortho_types:
+        print(f"Error: Specified orthology type '{orthology_type}' not found.")
+        print("Available orthology types are:", unique_ortho_types)
+        return None
+    # Filter the orthology dataframe based on the specified orthology type
+    filtered_orthology_df = orthology_df[orthology_df[ortho_type_col] == orthology_type]
+    # Merge the mouse dataframe with the filtered orthology dataframe
+    merged_df = mouse_df.merge(
+        filtered_orthology_df[[ortho_mouse_col, ortho_human_col, ortho_type_col]],
+        left_on=mouse_col,
+        right_on=ortho_mouse_col,
+        how="left",
+    )
+    return merged_df
+# Example usage:
+# merged_df = orthologs_to_human(mouse_df, orthology_df, 'mouse_gene_column', 'ortho_mouse_gene_column', 'ortho_human_gene_column', 'orthology_type_column', 'ortholog_one2one')
+def preprocess_wide_to_long(df, reference_id, sep="|", keep_id_type=True):
+    """
+    Transforms the given DataFrame into a long format table where one specified column represents reference IDs
+    and all the entries from the other columns, including the specified column, are put into the second column.
+    Entries separated by a specified separator are split into individual values. Removes any duplicate values.
+    Handles NaN values appropriately by skipping them and removes rows with NaN in the reference_id column.
+    Args:
+    df (pd.DataFrame): The input DataFrame with gene information.
+    reference_id (str): The column name to be used as the reference identifier.
+    sep (str): The separator used to split entries in the ID columns.
+    keep_id_type (bool): Whether to keep the id_type column in the final output.
+    Returns:
+    pd.DataFrame: The transformed long format DataFrame with split values.
+    """
+    # Check for duplicate column names
+    if df.columns.duplicated().any():
+        raise ValueError("Duplicate column names detected in the DataFrame.")
+    # Remove rows where reference_id is NaN
+    initial_row_count = df.shape[0]
+    df = df.dropna(subset=[reference_id])
+    final_row_count = df.shape[0]
+    if initial_row_count != final_row_count:
+        print(
+            f"Removed {initial_row_count - final_row_count} rows with NaN in '{reference_id}'. {final_row_count} rows remain."
+        )
+    else:
+        print("No rows with NaN in the reference_id were found.")
+    # Check for duplicate values in reference_id column
+    if df[reference_id].duplicated().any():
+        print(
+            f"Warning: Duplicate values found in the '{reference_id}' column. This may cause issues with the transformation."
+        )
+    long_format_data = []
+    # Process each column except the reference_id
+    for col in df.columns:
+        if col != reference_id:
+            # Convert numeric columns to string
+            if pd.api.types.is_numeric_dtype(df[col]):
+                df[col] = df[col].astype(str)
+            # Split the values by the separator and create a new DataFrame for each column
+            exploded_df = df[[reference_id, col]].dropna().assign(**{col: df[col].str.split(sep)})
+            exploded_df = exploded_df.explode(col)
+            exploded_df["id_type"] = col
+            exploded_df = exploded_df.rename(columns={col: "id"})
+            long_format_data.append(exploded_df)
+    # Concatenate all the long format DataFrames
+    long_df = pd.concat(long_format_data)
+    # Add the reference_id as its own column
+    reference_id_df = df[[reference_id]].dropna()
+    reference_id_df["id_type"] = reference_id
+    reference_id_df["id"] = reference_id_df[reference_id]
+    long_df = pd.concat([long_df, reference_id_df], ignore_index=True)
+    # Rename the reference_id column to "reference_id"
+    long_df = long_df.rename(columns={reference_id: "reference_id"})
+    # Drop duplicate values
+    long_df.drop_duplicates(inplace=True)
+    if not keep_id_type:
+        # Drop the id_type column and remove duplicates based only on 'id' and 'reference_id'
+        long_df = long_df.drop(columns=["id_type"]).drop_duplicates()
+    # Reorder the columns
+    columns_order = ["id", "reference_id"] if not keep_id_type else ["id", "id_type", "reference_id"]
+    long_df = long_df[columns_order]
+    return long_df
+def categorise_mapping(df, ids_from_col, ids_to_col):
+    # Calculate the occurrences of each id and each gene_name
+    id_counts = df[ids_from_col].value_counts()
+    gene_counts = df[ids_to_col].value_counts()
+    # Map the counts back to the dataframe
+    df["id_count"] = df[ids_from_col].map(id_counts)
+    df["gene_count"] = df[ids_to_col].map(gene_counts)
+    # Determine match type based on counts
+    conditions = [(df["id_count"] > 1) & (df["gene_count"] > 1), (df["id_count"] > 1), (df["gene_count"] > 1)]
+    choices = ["many2many", "one2many", "many2one"]
+    df["match_type"] = np.select(conditions, choices, default="one2one")
+    # Drop the temporary columns used for counts
+    df.drop(columns=["id_count", "gene_count"], inplace=True)
+    return df
+def remove_whitespace(series):
+    # return series.astype(str).str.replace(r'^\s+|\s+$', '', regex=True)
+    return series.astype(str).str.strip()
+def unlist(nested_list):
+    """
+    Recursively flattens a nested list.
+    Args:
+    nested_list (list): A list that may contain nested lists.
+    Returns:
+    list: A flattened list.
+    """
+    flattened = []
+    for item in nested_list:
+        if isinstance(item, list):
+            flattened.extend(unlist(item))
+        else:
+            flattened.append(item)
+    return flattened
+def map_genes(
+    expr_mat,
+    expr_ids=None,
+    annot_mat=None,
+    annot_from="id",
+    annot_to="hgnc_symbol",
+    return_unmapped=False,
+    verbose=True,
+    error=False,
+    keep_prev_ids=False,
+):
+    """TODO: The code currently breaks when expr_mat already has a column called referene_id. This is because the mapped = pd.merge(...) does not merge the reference_id columns. Try to fix this."""
+    if expr_ids is not None:
+        expr_mat = expr_mat.rename(columns={expr_ids: "previous_ids"})
+        expr_ids = "previous_ids"
+    if expr_ids is None:
+        expr_ids = "previous_ids"
+        expr_mat[expr_ids] = expr_mat.index
+    with warnings.catch_warnings():
+        warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)
+        # Remove any whitespace - trailing or otherwise
+        expr_mat[expr_ids] = remove_whitespace(expr_mat[expr_ids])
+        if verbose:
+            print("\n [ gene ID mapping ] \n")
+            print(
+                f"\tdataset contains  : {len(expr_mat['previous_ids'])} ids, of which unique: {len(expr_mat['previous_ids'].unique())} - {round(len(expr_mat['previous_ids'].unique()) / len(expr_mat['previous_ids']) * 100, 1)}%"
+            )
+        # Remove any missing ids
+        missing_genes = expr_mat[expr_mat[expr_ids].isin([None, "", "nan"])]
+        if not missing_genes.empty:
+            if verbose:
+                print(f"\tfound {len(missing_genes)} missing ids", list(missing_genes[expr_ids].unique())[:5])
+            expr_mat = expr_mat[~expr_mat[expr_ids].isin([None, "", "nan"])]
+        # Check for ids that are already mapping
+        premapped = expr_mat[expr_mat["previous_ids"].isin(annot_mat[annot_to])]
+        premapped.loc[:, annot_to] = premapped["previous_ids"]
+        if verbose:
+            print(
+                f'\n\texpr_mat - of {len(expr_mat["previous_ids"].unique())} ids  {len(premapped["previous_ids"].unique())} - {round(len(premapped["previous_ids"].unique()) / len(expr_mat["previous_ids"].unique()) * 100, 3)}% directly map to annot_mat${annot_to}\n'
+            )
+        # Map using exact match
+        unmapped_hgnc = expr_mat[~expr_mat["previous_ids"].isin(premapped["previous_ids"])]
+        if unmapped_hgnc.empty:
+            if keep_prev_ids:
+                return premapped.drop_duplicates()
+            return premapped.drop(columns=["previous_ids"], errors="ignore").drop_duplicates()
+        mapped = pd.merge(
+            expr_mat[~expr_mat["previous_ids"].isin(premapped["previous_ids"])],
+            annot_mat[[annot_from, annot_to]].drop_duplicates(),
+            left_on="previous_ids",
+            right_on=annot_from,
+            how="inner",
+        )
+        mapped = pd.concat([mapped, premapped if not premapped.empty else None])
+        # Map the remainder using lowercase
+        remap = expr_mat[~expr_mat["previous_ids"].isin(mapped["previous_ids"])]
+        remap.loc[:, "previous_ids"] = remap["previous_ids"].str.lower()
+        reannot = annot_mat[[annot_from, annot_to]].drop_duplicates()
+        reannot[annot_from] = reannot[annot_from].str.lower()
+        remap = pd.merge(remap, reannot, left_on="previous_ids", right_on=annot_from, how="inner")
+        mapped = pd.concat([mapped, remap]).drop_duplicates()
+        dups = mapped[mapped.duplicated(subset=[annot_to], keep=False)][annot_to].unique()
+        uniq = mapped[~mapped[annot_to].isin(dups)][annot_to].unique()
+        if verbose:
+            print(f'\tone2one: {len(uniq)}\t{", ".join(uniq[:5])}')
+            print(f'\tmany2one: {len(dups)}\t{", ".join(dups[:5])}')
+        unmapped = expr_mat["previous_ids"][
+            ~expr_mat["previous_ids"].str.lower().isin(mapped["previous_ids"].str.lower())
+        ].unique()
+        if verbose:
+            print(f'\n\tunmapped genes: {len(unmapped)}\t::  {", ".join(unmapped[:5])}\n')
+            print("\n\n")
+        result = mapped
+        if return_unmapped:
+            unmapped_expr_mat = expr_mat[expr_mat["previous_ids"].isin(unmapped)]
+            if not unmapped_expr_mat.empty:
+                unmapped_expr_mat.loc[:, annot_to] = ""
+                result = pd.concat([result, unmapped_expr_mat])
+        result = result.loc[:, result.columns.isin(unlist([list(expr_mat.columns.values), annot_to]))]
+    if keep_prev_ids:
+        return result.drop_duplicates()
+    return result.drop(columns=["previous_ids"], errors="ignore").drop_duplicates()
+##========================================================================================================================
+##==========    Test functions    ================================================================================
+##========================================================================================================================
+def test_transform_function():
+    """
+    Test case for the transform_and_split_to_long_format function using a toy example.
+    """
+    data = {
+        "Gene stable ID": ["ID1|ID2", "ID3", "ID4|ID5"],
+        "Gene stable ID version": ["ID1.1", "ID3.1", None],
+        "Gene Synonym": ["Syn1", None, "Syn4"],
+        "Gene name": ["GeneA", "GeneB", "GeneC"],
+    }
+    df = pd.DataFrame(data)
+    expected_data = {
+        "id": ["ID1", "ID2", "ID1.1", "Syn1", "GeneA", "ID3", "ID3.1", "GeneB", "ID4", "ID5", "Syn4", "GeneC"],
+        "id_type": [
+            "Gene stable ID",
+            "Gene stable ID",
+            "Gene stable ID version",
+            "Gene Synonym",
+            "Gene name",
+            "Gene stable ID",
+            "Gene stable ID version",
+            "Gene name",
+            "Gene stable ID",
+            "Gene stable ID",
+            "Gene Synonym",
+            "Gene name",
+        ],
+        "reference_id": [
+            "GeneA",
+            "GeneA",
+            "GeneA",
+            "GeneA",
+            "GeneA",
+            "GeneB",
+            "GeneB",
+            "GeneB",
+            "GeneC",
+            "GeneC",
+            "GeneC",
+            "GeneC",
+        ],
+    }
+    expected_df = pd.DataFrame(expected_data)
+    # Transform the DataFrame
+    long_df = transform_and_split_to_long_format(df, "Gene name")  # noqa
+    # Sort the DataFrame for comparison
+    long_df = long_df.sort_values(by=["id", "id_type", "reference_id"]).reset_index(drop=True)
+    expected_df = expected_df.sort_values(by=["id", "id_type", "reference_id"]).reset_index(drop=True)
+    # Check if the transformed DataFrame matches the expected DataFrame
+    assert long_df.equals(expected_df), "test_transform_function\t\t- did not produce expected result"
+    print("test_transform_function\t\t- passed")
+# Test for categorise_mapping
+def test_categorise_function():
+    mapping_test_data = {
+        "ids": ["id1", "id2", "id3", "id4", "id1", "id5"],
+        "gene_names": ["gene1", "gene2", "gene3", "gene3", "gene4", "gene5"],
+        "expected_match_type": ["one2many", "one2one", "many2one", "many2one", "one2many", "one2one"],
+    }
+    mapping_test_data = pd.DataFrame(mapping_test_data)
+    test_data = {
+        "ids": ["id1", "id2", "id3", "id4", "id1", "id5"],
+        "gene_names": ["gene1", "gene2", "gene3", "gene3", "gene4", "gene5"],
+    }
+    df_test = pd.DataFrame(test_data)
+    print("\nRunning optimized version:")
+    annotated_df_optimized = categorise_mapping(df_test.copy(), "ids", "gene_names")
+    print(annotated_df_optimized)
+    # Verify the results
+    assert (
+        annotated_df_optimized["match type"].tolist() == mapping_test_data["expected_match_type"].tolist()
+    ), "Optimized version failed"
+    print("\ntest_categorise_function\t\t- passed")
+# Only run the tests if this script is executed directly (not imported)
+if __name__ == "__main__":
+    test_transform_function()
+    test_categorise_function()

teddy/data_processing/utils/medians/data/teddy_gene_medians.json ADDED Viewed

The diff for this file is too large to render. See raw diff

teddy/models/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

teddy/models/__init__.py ADDED Viewed

File without changes

teddy/models/classification_heads.py ADDED Viewed

	@@ -0,0 +1,285 @@

+"""
+Module: classification_heads.py
+This module defines various classification and decoder heads for use in transformer-based models,
+specifically tailored for single-cell biology tasks. These heads are designed to handle tasks such as
+classification, regression, and expression value prediction, and they integrate seamlessly with
+transformer architectures.
+Main Features:
+- **ClsDecoder**: A simple decoder for classification tasks, supporting multiple layers and activations.
+- **ClassificationHead**: A RoBERTa-style classification head for downstream tasks.
+- **ClassificationHeadAnalysis**: An extended classification head that provides intermediate hidden states for analysis.
+- **ClsDecoderAnalysis**: A classification decoder with support for hidden state extraction.
+- **TrainingHead**: A dense layer with activation and normalization for training tasks.
+- **AnnotationDecoderHead**: A lightweight decoder for annotation tasks with simplified weight initialization.
+- **ExprDecoder**: A decoder for predicting gene expression values, with optional explicit zero probability prediction.
+- **AffineExprDecoder**: A decoder for predicting gene expression values in an affine form (Ax + b), with support for
+  advanced features like adaptive bias and explicit zero probabilities.
+Dependencies:
+- PyTorch: For defining and training neural network components.
+- Transformers: For activation functions and integration with transformer-based models.
+Usage:
+Import the desired classification or decoder head into your model:
+   ```python
+   from teddy.models.classification_heads import ClsDecoder, ClassificationHead
+   ```
+"""
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+from torch import Tensor
+from transformers.activations import ACT2FN
+class ClsDecoder(nn.Module):  # taken from scGPT. Delete when not needed any more.
+    """
+    Decoder for classification task.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_cls: int,
+        nlayers: int = 1,
+        activation: callable = nn.ReLU,
+    ):
+        super().__init__()
+        # module list
+        self._decoder = nn.ModuleList()
+        for _i in range(nlayers - 1):
+            self._decoder.append(nn.Linear(d_model, d_model))
+            self._decoder.append(activation())
+            self._decoder.append(nn.LayerNorm(d_model))
+        self.out_layer = nn.Linear(d_model, n_cls)
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: Tensor, shape [batch_size, embsize]
+        """
+        for layer in self._decoder:
+            x = layer(x)
+        return {"output": self.out_layer(x)}
+class ClassificationHead(nn.Module):
+    """RoBERTa-style classification head"""
+    def __init__(self, config, n_cls, nlayers):
+        super().__init__()
+        self._decoder = nn.ModuleList()
+        self.activation = nn.ReLU() if config.layer_activation == "relu" else nn.GELU()
+        for _i in range(nlayers):
+            self._decoder.append(nn.Dropout(config.dropout))
+            self._decoder.append(nn.Linear(config.d_model, config.d_model))
+            self._decoder.append(self.activation)
+            self._decoder.append(nn.Dropout(config.dropout))
+        self._decoder.append(nn.Linear(config.d_model, n_cls))
+    def forward(self, x):
+        for module in self._decoder:
+            x = module(x)
+        return {"output": x}
+class ClassificationHeadAnalysis(nn.Module):
+    """RoBERTa-style classification head"""
+    def __init__(self, config, n_cls, nlayers):
+        super().__init__()
+        self.dropout = nn.Dropout(config.dropout)
+        self._decoder = nn.ModuleList()
+        self.activation = nn.ReLU() if config.layer_activation == "relu" else nn.GELU()
+        for _i in range(nlayers):
+            self._decoder.append(self.dropout)
+            self._decoder.append(nn.Linear(config.d_model, config.d_model))
+            self._decoder.append(self.activation)
+            self._decoder.append(self.dropout)
+        self._decoder.append(nn.Linear(config.d_model, n_cls))
+    def forward(self, x):
+        hidden_states = []
+        for module in self._decoder:
+            x = module(x)
+            if isinstance(module, nn.Linear):
+                hidden_states.append(x)
+        return {"output": x, "hidden_states": hidden_states}
+class ClsDecoderAnalysis(nn.Module):
+    """
+    Decoder for classification task.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_cls: int,
+        nlayers: int = 3,
+        activation: callable = nn.ReLU,
+    ):
+        super().__init__()
+        # module list
+        self._decoder = nn.ModuleList()
+        for _i in range(nlayers - 1):
+            self._decoder.append(nn.Linear(d_model, d_model))
+            self._decoder.append(activation())
+            self._decoder.append(nn.LayerNorm(d_model))
+        self.out_layer = nn.Linear(d_model, n_cls)
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: Tensor, shape [batch_size, embsize]
+        """
+        hidden_states = []
+        for layer in self._decoder:
+            x = layer(x)
+            hidden_states.append(x)
+        return {"output": self.out_layer(x), "hidden_states": hidden_states}
+class TrainingHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.d_model, config.d_model)
+        self.activation = ACT2FN[config.layer_activation]
+        self.LayerNorm = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+class AnnotationDecoderHead(nn.Linear):
+    """Small class to make weight initialization easier"""
+    def __init__(self, d_model, n_token):
+        super().__init__(d_model, n_token, bias=False)
+class ExprDecoder(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        explicit_zero_prob: bool = False,
+        use_batch_labels: bool = False,
+    ):
+        super().__init__()
+        d_in = d_model * 2 if use_batch_labels else d_model
+        self.fc = nn.Sequential(
+            nn.Linear(d_in, d_model),
+            nn.LeakyReLU(),
+            nn.Linear(d_model, d_model),
+            nn.LeakyReLU(),
+            nn.Linear(d_model, 1),
+        )
+        self.explicit_zero_prob = explicit_zero_prob
+        if explicit_zero_prob:
+            self.zero_logit = nn.Sequential(
+                nn.Linear(d_in, d_model),
+                nn.LeakyReLU(),
+                nn.Linear(d_model, d_model),
+                nn.LeakyReLU(),
+                nn.Linear(d_model, 1),
+            )
+    def forward(self, x: Tensor, values: Tensor = None) -> Dict[str, Tensor]:
+        """x is the output of the transformer, (batch, seq_len, d_model)"""
+        pred_value = self.fc(x).squeeze(-1)  # (batch, seq_len)
+        if not self.explicit_zero_prob:
+            return {"pred": pred_value}
+        zero_logits = self.zero_logit(x).squeeze(-1)  # (batch, seq_len)
+        zero_probs = torch.sigmoid(zero_logits)
+        return {"pred": pred_value, "zero_probs": zero_probs}
+        # TODO: note that the return currently is only for training. Since decoder
+        # is not used in the test setting for the integration task, the experiments/inference
+        # logic is not implemented yet. However, remember to implement it when
+        # the decoder is used in any test setting. The inference logic will need
+        # to sample from the bernoulli distribution with the zero_probs.
+class AffineExprDecoder(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        explicit_zero_prob: bool = False,
+        activation: Optional[str] = None,
+        tanh_coeff: bool = False,
+        adaptive_bias: bool = False,
+    ):
+        """
+        Predict the expression value of each gene in an affine like form of Ax + b.
+        This decoder takes two ExprDecoder intrinsically to genrate the coefficient A and bias b.
+        Args:
+            d_model: The embedding dimension.
+            explicit_zero_prob: If True, predict the probability of each gene being
+                zero.
+            activation: The activation function for the coefficient A and bias b.
+            tanh_coeff: If True, use tanh activation for the coefficient A.
+            adaptive_bias: If True, use a learnable bias for the bias b.
+        """
+        super().__init__()
+        self.explicit_zero_prob = explicit_zero_prob
+        self.tanh_coeff = tanh_coeff
+        self.adaptive_bias = adaptive_bias
+        self.coeff_decoder = ExprDecoder(d_model, explicit_zero_prob=explicit_zero_prob)
+        self.bias_decoder = ExprDecoder(d_model, explicit_zero_prob=explicit_zero_prob)
+        self.activation = activation
+        if activation is not None:
+            # Normalize activation name to lowercase for flexibility
+            activation = activation.lower()
+            # Mapping of known activation functions
+            activations_map = {
+                "gelu": "GELU",
+                "relu": "ReLU",
+                "tanh": "Tanh",
+                "sigmoid": "Sigmoid",
+            }
+            assert activation in activations_map, f"Unknown activation: {activation}"
+            assert hasattr(nn, activations_map[activation]), f"Unknown activation: {activation}"
+            self.activation = getattr(nn, activations_map[activation])()
+    def forward(self, x: Tensor, values: Tensor) -> Tensor:
+        """
+        Args:
+            x: Tensor, shape [batch_size, seq_len, embsize]
+            values: Tensor, shape [batch_size, seq_len]
+        Returns:
+            output Tensor of shape [batch_size, seq_len]
+        """
+        coeff = self.coeff_decoder(x)
+        bias = self.bias_decoder(x)
+        if self.activation is not None:
+            coeff["pred"] = self.activation(coeff["pred"])
+            bias["pred"] = self.activation(bias["pred"])
+        # if self.tanh_coeff:
+        #     coeff["pred"] = 1 + torch.tanh(coeff["pred"])
+        if self.adaptive_bias:
+            # bias["pred"] = bias["pred"] * values.mean(dim=1, keepdim=True)
+            non_zero_value_mean = values.sum(dim=1, keepdim=True) / (values != 0).sum(dim=1, keepdim=True)
+            bias["pred"] = bias["pred"] * non_zero_value_mean
+        if self.explicit_zero_prob:
+            return {
+                "pred": coeff["pred"] * values + bias["pred"],
+                "zero_probs": coeff["zero_probs"],
+            }
+        return {"pred": coeff["pred"] * values + bias["pred"]}

teddy/models/model_directory.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""
+Module: model_directory.py
+This module provides a centralized directory for managing and accessing different model architectures
+used in the TEDDY project. It defines a dictionary of supported models and their configurations,
+allowing for easy integration and dynamic loading of models based on their names or paths.
+Main Features:
+- **model_dict**: A dictionary mapping model names to their corresponding classes, configurations,
+  and masking keys. This enables seamless switching between different model architectures.
+- **get_architecture**: A utility function to retrieve the architecture name from a model's configuration file.
+Dependencies:
+- json: For loading model configuration files.
+- os: For handling file paths.
+- teddy.models.teddy_g.model: For importing the `TeddyGModel`, `TeddyGConfig`, and `TeddyGModelAnalysis` classes.
+Usage:
+1. Access a model and its configuration from the `model_dict`:
+   ```python
+   model_info = model_dict["TeddyGModel"]
+   model_cls = model_info["model_cls"]
+   config_cls = model_info["config_cls"]
+   ```
+2. Retrieve the architecture name from a model's configuration file:
+   ```python
+   architecture = get_architecture(model_name_or_path)
+   ```
+"""
+import json
+import os
+from teddy.models.teddy_g.model import (
+    TeddyGConfig,
+    TeddyGModel,
+    TeddyGModelAnalysis,
+)
+# Registry of supported model architectures, keyed by architecture name. Each entry provides:
+#   "model_cls"   - the model class to instantiate
+#   "config_cls"  - the configuration class paired with that model
+#   "masking_key" - name of the masking key for that model (presumably the batch field that
+#                   masking is applied to - confirm against the data collator)
+model_dict = {
+    "TeddyGModel": {"model_cls": TeddyGModel, "config_cls": TeddyGConfig, "masking_key": "gene_ids"},
+    # Analysis variant: uses the same config class and masking key as TeddyGModel.
+    "TeddyGModelAnalysis": {
+        "model_cls": TeddyGModelAnalysis,
+        "config_cls": TeddyGConfig,
+        "masking_key": "gene_ids",
+    },
+}
+def get_architecture(model_name_or_path):
+    with open(os.path.join(model_name_or_path, "config.json")) as f:
+        config = json.load(f)
+    return config["architectures"][0]

teddy/models/teddy_g/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

teddy/models/teddy_g/160M/added_tokens.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "<cls>": 43811,
+  "<mask>": 43812,
+  "<pad>": 43810,
+  "<sep>": 43809,
+  "<unk>": 43808
+}

teddy/models/teddy_g/160M/config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "architectures": [
+    "TeddyGModel"
+  ],
+  "cls_loss": false,
+  "annotation_loss_weight": null,
+  "modeling_loss_weight": null,
+  "d_hid": 3072,
+  "d_model": 768,
+  "dropout": 0.02,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_activation": "gelu",
+  "mask_token": "<mask>",
+  "mask_token_id": 1,
+  "masking_loss": false,
+  "max_position_embeddings": 2048,
+  "n_cls": 0,
+  "n_layers_cls": 0,
+  "nheads": 12,
+  "nlayers": 12,
+  "ntoken": 43840,
+  "pad_token_id": -100,
+  "pre_norm": false,
+  "torch_dtype": "float32"
+}