{ "cells": [ { "cell_type": "markdown", "id": "a17d8764", "metadata": {}, "source": [ "#### Preprocessing Notebook" ] }, { "cell_type": "code", "execution_count": 53, "id": "7eba927e-1ea6-4588-a5e8-48a1ec9a212a", "metadata": {}, "outputs": [], "source": [ "import logging\n", "from pathlib import Path\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# 1) Import config & internal modules\n", "from preprocessing_pipeline import config\n", "from preprocessing_pipeline.download import (\n", " download_genome_references\n", ")\n", "from preprocessing_pipeline.data_io import (\n", " create_dir_if_not_exists,\n", " load_anndata,\n", " save_processed_datasets\n", ")\n", "from preprocessing_pipeline.filtering import (\n", " intersect_cells,\n", " remove_mitochondrial_genes\n", ")\n", "from preprocessing_pipeline.gene_selection import (\n", " load_gtf,\n", " filter_protein_coding_genes,\n", " compute_hvgs_and_tfs\n", ")\n", "from preprocessing_pipeline.peak_selection import keep_promoters_and_select_hv_peaks\n", "from preprocessing_pipeline.metacells import create_metacells\n", "from preprocessing_pipeline.motif_scanning import (\n", " run_bedtools_intersect,\n", " load_motif_database,\n", " compute_motif_scores\n", ")\n", "from preprocessing_pipeline.correlation import compute_in_silico_chipseq\n", "from preprocessing_pipeline.utils import create_extended_gene_bed,compute_gene_peak_distance_matrix\n", "\n", "logger = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 54, "id": "a3e28ad1-c8c2-41df-8f66-97a79bbe59cc", "metadata": {}, "outputs": [], "source": [ "from importlib import reload\n", "import sys\n", "\n", "for module_name in list(sys.modules.keys()):\n", " if module_name.startswith(\"preprocessing_pipeline\"):\n", " reload(sys.modules[module_name])" ] }, { "cell_type": "code", "execution_count": 3, "id": "e8d9fa35-37b5-4f3b-bb08-eff1703e5926", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:__main__:=== Starting multi-ome preprocessing pipeline ===\n" ] } ], "source": [ "logging.getLogger().setLevel(config.logging_level)\n", "\n", "logger.info(\"=== Starting multi-ome preprocessing pipeline ===\")" ] }, { "cell_type": "markdown", "id": "7cb47cf7", "metadata": {}, "source": [ "##### 1. Prepare directories" ] }, { "cell_type": "code", "execution_count": null, "id": "76ccb5d0-c564-4988-8c06-2bb903086668", "metadata": {}, "outputs": [], "source": [ "\n", "data_dir = Path(config.data_dir)\n", "genome_dir = Path(config.genome_dir)\n", "motif_dir = Path(config.motif_directory)\n", "out_dir = data_dir / config.output_subdir_name\n", "\n", "create_dir_if_not_exists(genome_dir)\n", "create_dir_if_not_exists(motif_dir)\n", "create_dir_if_not_exists(out_dir)" ] }, { "cell_type": "markdown", "id": "cdffc8a7", "metadata": {}, "source": [ "##### 2. Download reference genome, gene annotations and chromosome sizes" ] }, { "cell_type": "code", "execution_count": null, "id": "4f7904a5-021f-4e47-9447-7b778435844a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:preprocessing_pipeline.download:Using genome references for species='mouse', assembly='mm10'.\n", "GTF: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M18/gencode.vM18.basic.annotation.gtf.gz\n", "Chrom.sizes: https://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes\n", "FASTA: https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz\n", "INFO:preprocessing_pipeline.download:Reference files are ready in /data/saraswat/new_metacells/mouse_genome_files\n" ] } ], "source": [ "\n", "download_genome_references(\n", " genome_dir=genome_dir,\n", " species=config.species,\n", " assembly=config.genome_assembly,\n", " gtf_url=config.gtf_url,\n", " chrom_sizes_url=config.chrom_sizes_url,\n", " fasta_url=config.fasta_url\n", ")\n", "#download_motif_database(motif_dir, config.motif_database, config.species)" ] }, { "cell_type": "markdown", "id": "ed7bbd46", "metadata": {}, "source": [ "##### 3. Load RNA and ATAC anndata" ] }, { "cell_type": "code", "execution_count": null, "id": "abc4b74f-c240-4380-8626-a7395da9e2e2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:preprocessing_pipeline.data_io:Loading RNA from /data/saraswat/new_metacells/data_gastrulation_single_cell/anndata.h5ad, ATAC from /data/saraswat/new_metacells/data_gastrulation_single_cell/PeakMatrix_anndata.h5ad\n" ] } ], "source": [ "\n", "data_rna, data_atac = load_anndata(\n", " data_dir,\n", " config.rna_adata_file_name,\n", " config.atac_adata_file_name\n", ")" ] }, { "cell_type": "markdown", "id": "52756f25", "metadata": {}, "source": [ "##### 4. Find cells common to both modalities and remove mito genes" ] }, { "cell_type": "code", "execution_count": null, "id": "07b6a579-ef7f-4932-bac4-bb092c5b1123", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:preprocessing_pipeline.filtering:Intersected cells: now RNA=(56861, 32285), ATAC=(56861, 192251)\n", "INFO:preprocessing_pipeline.filtering:Removed 13 mitochondrial genes with prefix=mt-\n" ] } ], "source": [ "\n", "data_rna, data_atac = intersect_cells(data_rna, data_atac)\n", "data_rna = remove_mitochondrial_genes(data_rna, mito_prefix=config.mitochondrial_prefix)\n" ] }, { "cell_type": "markdown", "id": "d956a76a", "metadata": {}, "source": [ "##### 5. Optionally filter anndata to protein-coding genes" ] }, { "cell_type": "code", "execution_count": null, "id": "830435dd-347e-41c9-b40a-dfb2beecc377", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:preprocessing_pipeline.gene_selection:Loading GTF from /data/saraswat/new_metacells/mouse_genome_files/annotation.gtf\n", "INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'protein_id', 'ccdsid', 'ont']\n", "INFO:preprocessing_pipeline.gene_selection:Filtered to protein-coding genes: 19374 genes left.\n" ] } ], "source": [ "# \n", "gtf_file = genome_dir / \"annotation.gtf\"\n", "gtf_df = load_gtf(gtf_file)\n", "data_rna = filter_protein_coding_genes(data_rna, gtf_df)" ] }, { "cell_type": "markdown", "id": "cf26f71f", "metadata": {}, "source": [ "##### 6. Selecting highly variable genes and TFs\n", "First we find which TFs have a motif present in the database provided.\n", "User provided sets of genes and TFs are included in final list by default; highly variable computations are performed to obtain remaining genes and TFs" ] }, { "cell_type": "code", "execution_count": null, "id": "c840c910-127d-4083-8d06-bb16160271c4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:preprocessing_pipeline.gene_selection:Selecting HVGs and TFs...\n", "INFO:preprocessing_pipeline.gene_selection:Selected 3700 HVGs + 300 TFs.\n" ] } ], "source": [ "\n", "motif_path = motif_dir / f\"{config.motif_database}_{config.species}.meme\"\n", "tf_names_all = []\n", "with open(motif_path, \"r\") as f:\n", " for line in f:\n", " if line.startswith(\"MOTIF\"):\n", " parts = line.strip().split()\n", " if len(parts) >= 3:\n", " tf_name = parts[2].split(\"_\")[0].strip(\"()\").strip()\n", " tf_names_all.append(tf_name)\n", "tf_names_all = sorted(list(set(tf_names_all)))\n", "\n", "data_rna, final_genes, final_tfs = compute_hvgs_and_tfs(\n", " data_rna=data_rna,\n", " tf_names=tf_names_all,\n", " user_genes=config.genes_user,\n", " user_tfs=config.tfs_user,\n", " num_genes=config.num_genes,\n", " num_tfs=config.num_tfs,\n", " min_cells=config.min_cells_per_gene\n", ")" ] }, { "cell_type": "markdown", "id": "dcf20b7f", "metadata": {}, "source": [ "##### 7. Create extended gene bed file \n", "Here we extend the gene body to the user defined genomic window for processing later\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "66604c35-44ce-4456-a771-fd572a682227", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:__main__:Created extended gene bed => /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/genes_extended_80kb.bed\n" ] } ], "source": [ "\n", "chrom_sizes_path = genome_dir / f\"{config.genome_assembly}.chrom.sizes\"\n", "extended_genes_bed_df = create_extended_gene_bed(\n", " gtf_df,\n", " final_genes + final_tfs, # if we want to include TF genes too\n", " window_size=config.window_size,\n", " chrom_sizes_path=chrom_sizes_path\n", ")\n", "\n", "gene_bed_file = out_dir / f\"genes_extended_{config.window_size//1000}kb.bed\"\n", "extended_genes_bed_df.to_csv(gene_bed_file, sep=\"\\t\", header=False, index=False)\n", "logger.info(f\"Created extended gene bed => {gene_bed_file}\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "30ceb73a-c5a1-4fbb-a8a8-b1963e56bbbe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | idx | \n", "
---|---|
chr1:3035602-3036202 | \n", "1 | \n", "
chr1:3062653-3063253 | \n", "2 | \n", "
chr1:3072313-3072913 | \n", "3 | \n", "
chr1:3191496-3192096 | \n", "4 | \n", "
chr1:3340575-3341175 | \n", "5 | \n", "
... | \n", "... | \n", "
chrX:169902806-169903406 | \n", "3284 | \n", "
chrX:169905921-169906521 | \n", "3285 | \n", "
chrX:169915616-169916216 | \n", "3286 | \n", "
chrX:169925487-169926087 | \n", "3287 | \n", "
chrX:169937064-169937664 | \n", "3288 | \n", "
192251 rows × 1 columns
\n", "