{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a17d8764",
   "metadata": {},
   "source": [
    "#### Preprocessing Notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "7eba927e-1ea6-4588-a5e8-48a1ec9a212a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "from pathlib import Path\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# 1) Import config & internal modules\n",
    "from preprocessing_pipeline import config\n",
    "from preprocessing_pipeline.download import (\n",
    "    download_genome_references\n",
    ")\n",
    "from preprocessing_pipeline.data_io import (\n",
    "    create_dir_if_not_exists,\n",
    "    load_anndata,\n",
    "    save_processed_datasets\n",
    ")\n",
    "from preprocessing_pipeline.filtering import (\n",
    "    intersect_cells,\n",
    "    remove_mitochondrial_genes\n",
    ")\n",
    "from preprocessing_pipeline.gene_selection import (\n",
    "    load_gtf,\n",
    "    filter_protein_coding_genes,\n",
    "    compute_hvgs_and_tfs\n",
    ")\n",
    "from preprocessing_pipeline.peak_selection import keep_promoters_and_select_hv_peaks\n",
    "from preprocessing_pipeline.metacells import create_metacells\n",
    "from preprocessing_pipeline.motif_scanning import (\n",
    "    run_bedtools_intersect,\n",
    "    load_motif_database,\n",
    "    compute_motif_scores\n",
    ")\n",
    "from preprocessing_pipeline.correlation import compute_in_silico_chipseq\n",
    "from preprocessing_pipeline.utils import create_extended_gene_bed,compute_gene_peak_distance_matrix\n",
    "\n",
    "logger = logging.getLogger(__name__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "a3e28ad1-c8c2-41df-8f66-97a79bbe59cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "from importlib import reload\n",
    "import sys\n",
    "\n",
    "for module_name in list(sys.modules.keys()):\n",
    "    if module_name.startswith(\"preprocessing_pipeline\"):\n",
    "        reload(sys.modules[module_name])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e8d9fa35-37b5-4f3b-bb08-eff1703e5926",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:__main__:=== Starting multi-ome preprocessing pipeline ===\n"
     ]
    }
   ],
   "source": [
    "logging.getLogger().setLevel(config.logging_level)\n",
    "\n",
    "logger.info(\"=== Starting multi-ome preprocessing pipeline ===\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cb47cf7",
   "metadata": {},
   "source": [
    "##### 1. Prepare directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76ccb5d0-c564-4988-8c06-2bb903086668",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data_dir = Path(config.data_dir)\n",
    "genome_dir = Path(config.genome_dir)\n",
    "motif_dir = Path(config.motif_directory)\n",
    "out_dir = data_dir / config.output_subdir_name\n",
    "\n",
    "create_dir_if_not_exists(genome_dir)\n",
    "create_dir_if_not_exists(motif_dir)\n",
    "create_dir_if_not_exists(out_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdffc8a7",
   "metadata": {},
   "source": [
    "##### 2. Download reference genome, gene annotations and chromosome sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f7904a5-021f-4e47-9447-7b778435844a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.download:Using genome references for species='mouse', assembly='mm10'.\n",
      "GTF: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M18/gencode.vM18.basic.annotation.gtf.gz\n",
      "Chrom.sizes: https://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes\n",
      "FASTA: https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz\n",
      "INFO:preprocessing_pipeline.download:Reference files are ready in /data/saraswat/new_metacells/mouse_genome_files\n"
     ]
    }
   ],
   "source": [
    "\n",
    "download_genome_references(\n",
    "    genome_dir=genome_dir,\n",
    "    species=config.species,\n",
    "    assembly=config.genome_assembly,\n",
    "    gtf_url=config.gtf_url,\n",
    "    chrom_sizes_url=config.chrom_sizes_url,\n",
    "    fasta_url=config.fasta_url\n",
    ")\n",
    "#download_motif_database(motif_dir, config.motif_database, config.species)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed7bbd46",
   "metadata": {},
   "source": [
    "##### 3. Load RNA and ATAC anndata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abc4b74f-c240-4380-8626-a7395da9e2e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.data_io:Loading RNA from /data/saraswat/new_metacells/data_gastrulation_single_cell/anndata.h5ad, ATAC from /data/saraswat/new_metacells/data_gastrulation_single_cell/PeakMatrix_anndata.h5ad\n"
     ]
    }
   ],
   "source": [
    "\n",
    "data_rna, data_atac = load_anndata(\n",
    "    data_dir,\n",
    "    config.rna_adata_file_name,\n",
    "    config.atac_adata_file_name\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52756f25",
   "metadata": {},
   "source": [
    "##### 4. Find cells common to both modalities and remove mito genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07b6a579-ef7f-4932-bac4-bb092c5b1123",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.filtering:Intersected cells: now RNA=(56861, 32285), ATAC=(56861, 192251)\n",
      "INFO:preprocessing_pipeline.filtering:Removed 13 mitochondrial genes with prefix=mt-\n"
     ]
    }
   ],
   "source": [
    "\n",
    "data_rna, data_atac = intersect_cells(data_rna, data_atac)\n",
    "data_rna = remove_mitochondrial_genes(data_rna, mito_prefix=config.mitochondrial_prefix)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d956a76a",
   "metadata": {},
   "source": [
    "##### 5. Optionally filter anndata to protein-coding genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "830435dd-347e-41c9-b40a-dfb2beecc377",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.gene_selection:Loading GTF from /data/saraswat/new_metacells/mouse_genome_files/annotation.gtf\n",
      "INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'protein_id', 'ccdsid', 'ont']\n",
      "INFO:preprocessing_pipeline.gene_selection:Filtered to protein-coding genes: 19374 genes left.\n"
     ]
    }
   ],
   "source": [
    "# \n",
    "gtf_file = genome_dir / \"annotation.gtf\"\n",
    "gtf_df = load_gtf(gtf_file)\n",
    "data_rna = filter_protein_coding_genes(data_rna, gtf_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf26f71f",
   "metadata": {},
   "source": [
    "##### 6. Selecting highly variable genes and TFs\n",
    "First we find which TFs have a motif present in the database provided.\n",
    "User provided sets of genes and TFs are included in final list by default; highly variable computations are performed to obtain remaining genes and TFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c840c910-127d-4083-8d06-bb16160271c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.gene_selection:Selecting HVGs and TFs...\n",
      "INFO:preprocessing_pipeline.gene_selection:Selected 3700 HVGs + 300 TFs.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "motif_path = motif_dir / f\"{config.motif_database}_{config.species}.meme\"\n",
    "tf_names_all = []\n",
    "with open(motif_path, \"r\") as f:\n",
    "    for line in f:\n",
    "        if line.startswith(\"MOTIF\"):\n",
    "            parts = line.strip().split()\n",
    "            if len(parts) >= 3:\n",
    "                tf_name = parts[2].split(\"_\")[0].strip(\"()\").strip()\n",
    "                tf_names_all.append(tf_name)\n",
    "tf_names_all = sorted(list(set(tf_names_all)))\n",
    "\n",
    "data_rna, final_genes, final_tfs = compute_hvgs_and_tfs(\n",
    "    data_rna=data_rna,\n",
    "    tf_names=tf_names_all,\n",
    "    user_genes=config.genes_user,\n",
    "    user_tfs=config.tfs_user,\n",
    "    num_genes=config.num_genes,\n",
    "    num_tfs=config.num_tfs,\n",
    "    min_cells=config.min_cells_per_gene\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dcf20b7f",
   "metadata": {},
   "source": [
    "##### 7. Create extended gene bed file \n",
    "Here we extend the gene body to the user defined genomic window for processing later\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66604c35-44ce-4456-a771-fd572a682227",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:__main__:Created extended gene bed => /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/genes_extended_80kb.bed\n"
     ]
    }
   ],
   "source": [
    "\n",
    "chrom_sizes_path = genome_dir / f\"{config.genome_assembly}.chrom.sizes\"\n",
    "extended_genes_bed_df = create_extended_gene_bed(\n",
    "    gtf_df,\n",
    "    final_genes + final_tfs,  # if we want to include TF genes too\n",
    "    window_size=config.window_size,\n",
    "    chrom_sizes_path=chrom_sizes_path\n",
    ")\n",
    "\n",
    "gene_bed_file = out_dir / f\"genes_extended_{config.window_size//1000}kb.bed\"\n",
    "extended_genes_bed_df.to_csv(gene_bed_file, sep=\"\\t\", header=False, index=False)\n",
    "logger.info(f\"Created extended gene bed => {gene_bed_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "30ceb73a-c5a1-4fbb-a8a8-b1963e56bbbe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>chr1:3035602-3036202</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chr1:3062653-3063253</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chr1:3072313-3072913</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chr1:3191496-3192096</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chr1:3340575-3341175</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chrX:169902806-169903406</th>\n",
       "      <td>3284</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chrX:169905921-169906521</th>\n",
       "      <td>3285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chrX:169915616-169916216</th>\n",
       "      <td>3286</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chrX:169925487-169926087</th>\n",
       "      <td>3287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chrX:169937064-169937664</th>\n",
       "      <td>3288</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>192251 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                           idx\n",
       "chr1:3035602-3036202         1\n",
       "chr1:3062653-3063253         2\n",
       "chr1:3072313-3072913         3\n",
       "chr1:3191496-3192096         4\n",
       "chr1:3340575-3341175         5\n",
       "...                        ...\n",
       "chrX:169902806-169903406  3284\n",
       "chrX:169905921-169906521  3285\n",
       "chrX:169915616-169916216  3286\n",
       "chrX:169925487-169926087  3287\n",
       "chrX:169937064-169937664  3288\n",
       "\n",
       "[192251 rows x 1 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_atac.var"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd112a39",
   "metadata": {},
   "source": [
    "##### 8. Create bed file for all peaks\n",
    "Of note, that peak anndata var should have chr, start, end and peak_name columns. If not, obtain them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efe7b58d-ef38-4449-911f-a7fa87619011",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data_atac.var[\"chr\"] = [x.split(\":\")[0] for x in data_atac.var.index]\n",
    "data_atac.var[\"start\"] = [int(x.split(\":\")[1].split(\"-\")[0]) for x in data_atac.var.index]\n",
    "data_atac.var[\"end\"] = [int(x.split(\":\")[1].split(\"-\")[1]) for x in data_atac.var.index]\n",
    "data_atac.var[\"peak_name\"] = data_atac.var.index\n",
    "all_peaks_bed = out_dir / \"peaks_all.bed\"\n",
    "data_atac.var[[\"chr\",\"start\",\"end\",\"peak_name\"]].to_csv(all_peaks_bed, sep=\"\\t\", header=False, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a64098dd",
   "metadata": {},
   "source": [
    "##### 9. intersect peaks with extended gene window\n",
    "\n",
    "Here we subset to peaks which are within a user-defined genomic window of atleast one (selected) gene."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e768a28-b448-4841-822e-ffe600baecd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.motif_scanning:Running: bedtools intersect -u -wa -a /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/peaks_all.bed -b /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/genes_extended_80kb.bed > /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/peaks_intersected.bed\n",
      "INFO:__main__:After gene-window filtering => shape=(56861, 98566)\n"
     ]
    }
   ],
   "source": [
    "# \n",
    "intersected_bed = out_dir / \"peaks_intersected.bed\"\n",
    "run_bedtools_intersect(a_bed=all_peaks_bed, b_bed=gene_bed_file, out_bed=intersected_bed)\n",
    "\n",
    "peaks_intersected = pd.read_csv(intersected_bed, sep=\"\\t\", header=None)\n",
    "peaks_intersected.columns = [\"chr\",\"start\",\"end\",\"peak_name\"]\n",
    "windowed_set = set(peaks_intersected[\"peak_name\"])\n",
    "\n",
    "# Subset data_atac to these peaks\n",
    "data_atac = data_atac[:, list(windowed_set)].copy()\n",
    "logger.info(f\"After gene-window filtering => shape={data_atac.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bbb8a7b",
   "metadata": {},
   "source": [
    "##### 10. Create metacells and store in .obs[\"leiden\"]\n",
    "\n",
    "Here we obtain metacells using fine-grained leiden clustering on RNA modality. These metacells ar eused to calculate highly variable peaks and to calculate insilico-chipseq scores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f26b9d5f-37fc-4412-bd26-05d9d44ec5a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.metacells:Creating metacells with resolution=10 (grouping key=leiden).\n",
      "2025-03-14 20:19:19,432 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...\n",
      "INFO:harmonypy:Computing initial centroids with sklearn.KMeans...\n",
      "2025-03-14 20:19:20,834 - harmonypy - INFO - sklearn.KMeans initialization complete.\n",
      "INFO:harmonypy:sklearn.KMeans initialization complete.\n",
      "2025-03-14 20:19:21,003 - harmonypy - INFO - Iteration 1 of 10\n",
      "INFO:harmonypy:Iteration 1 of 10\n",
      "2025-03-14 20:19:27,921 - harmonypy - INFO - Iteration 2 of 10\n",
      "INFO:harmonypy:Iteration 2 of 10\n",
      "2025-03-14 20:19:34,804 - harmonypy - INFO - Iteration 3 of 10\n",
      "INFO:harmonypy:Iteration 3 of 10\n",
      "2025-03-14 20:19:41,695 - harmonypy - INFO - Iteration 4 of 10\n",
      "INFO:harmonypy:Iteration 4 of 10\n",
      "2025-03-14 20:19:48,586 - harmonypy - INFO - Iteration 5 of 10\n",
      "INFO:harmonypy:Iteration 5 of 10\n",
      "2025-03-14 20:19:55,489 - harmonypy - INFO - Iteration 6 of 10\n",
      "INFO:harmonypy:Iteration 6 of 10\n",
      "2025-03-14 20:20:02,383 - harmonypy - INFO - Iteration 7 of 10\n",
      "INFO:harmonypy:Iteration 7 of 10\n",
      "2025-03-14 20:20:09,297 - harmonypy - INFO - Converged after 7 iterations\n",
      "INFO:harmonypy:Converged after 7 iterations\n",
      "/data/saraswat/new_metacells/scdori_pip/preprocessing_pipeline/metacells.py:50: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n",
      "\n",
      " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2.  directed must also be False to work with igraph's implementation.\n",
      "  sc.tl.leiden(data_rna, resolution=resolution, key_added=grouping_key)\n",
      "/data/saraswat/new_metacells/scdori_pip/preprocessing_pipeline/metacells.py:54: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
      "  cluster_groups = data_rna.obs.groupby(grouping_key)\n",
      "INFO:preprocessing_pipeline.metacells:Metacell shapes: RNA=(140, 4000), ATAC=(140, 98566)\n"
     ]
    }
   ],
   "source": [
    "#\n",
    "rna_metacell, atac_metacell = create_metacells(\n",
    "    data_rna, data_atac,\n",
    "    grouping_key=\"leiden\",\n",
    "    resolution=config.leiden_resolution,\n",
    "    batch_key=config.batch_key\n",
    ")\n",
    " # Copy labels\n",
    "data_atac.obs[\"leiden\"] = data_rna.obs[\"leiden\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14cd76d3",
   "metadata": {},
   "source": [
    "##### 11. Keep promoter peaks and highly variable peaks from the rest => total # = num_peaks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d121135-a2e1-4652-b4c8-784c0627e098",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:preprocessing_pipeline.peak_selection:Column promoter_col not found in data_atac.var; no special promoter logic.\n",
      "/data/saraswat/new_metacells/scdori_pip/preprocessing_pipeline/peak_selection.py:22: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
      "  cluster_groups = data_atac.obs.groupby(cluster_key)\n",
      "INFO:preprocessing_pipeline.peak_selection:Selected top 90000 variable peaks (by std across leiden).\n",
      "INFO:__main__:Final shape after combining promoters + HV => (56861, 90000)\n"
     ]
    }
   ],
   "source": [
    "# \n",
    "data_atac = keep_promoters_and_select_hv_peaks(\n",
    "    data_atac=data_atac,\n",
    "    total_n_peaks=config.num_peaks,\n",
    "    cluster_key=\"leiden\",\n",
    "    promoter_col=config.promoter_col  # column in data_atac.var\n",
    ")\n",
    "\n",
    "logger.info(f\"Final shape after combining promoters + HV => {data_atac.shape}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f006d500",
   "metadata": {},
   "source": [
    "##### 12. Save processed ATAC and RNA data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4600d1e-30d7-4e44-8550-db5b804c10cf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "... storing 'gene_type' as categorical\n",
      "... storing 'chr' as categorical\n",
      "INFO:preprocessing_pipeline.data_io:Saved processed RNA to /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/rna_processed.h5ad with shape=(56861, 4000)\n",
      "INFO:preprocessing_pipeline.data_io:Saved processed ATAC to /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/atac_processed.h5ad with shape=(56861, 90000)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "save_processed_datasets(data_rna, data_atac, out_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f8f490f",
   "metadata": {},
   "source": [
    "##### 13. Make bed file for final set of peaks (post selection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07118348-33f9-471d-9235-9eedb5a62642",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# \n",
    "data_atac.var[\"chr\"] = [v.split(\":\")[0] for v in data_atac.var_names]\n",
    "data_atac.var[\"start\"] = [int(v.split(\":\")[1].split(\"-\")[0]) for v in data_atac.var_names]\n",
    "data_atac.var[\"end\"] = [int(v.split(\":\")[1].split(\"-\")[1]) for v in data_atac.var_names]\n",
    "data_atac.var[\"peak_name\"] = data_atac.var_names\n",
    "peaks_bed = out_dir / \"peaks_selected.bed\"\n",
    "data_atac.var[[\"chr\",\"start\",\"end\",\"peak_name\"]].to_csv(\n",
    "    peaks_bed, sep=\"\\t\", header=False, index=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e2049f2",
   "metadata": {},
   "source": [
    "##### 14. Compute motif matches for peaks\n",
    "\n",
    "We use FIMO module from tangermeme (https://tangermeme.readthedocs.io/en/latest/tutorials/Tutorial_D1_FIMO.html) to score the motifs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0813fab-0845-40c7-95e2-cabf16661fc0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.motif_scanning:Reading motif file: /data/saraswat/new_metacells/motif_database/cisbp_mouse.meme\n",
      "INFO:preprocessing_pipeline.motif_scanning:Subselected 300 motifs for 300 TFs.\n",
      "INFO:preprocessing_pipeline.motif_scanning:Computing motif scores for /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/peaks_selected.bed (n_peaks=90000) with window=500\n",
      "100%|██████████| 300/300 [00:05<00:00, 57.26it/s]\n",
      "INFO:preprocessing_pipeline.motif_scanning:Finished computing motif scores: (90000, 300)\n"
     ]
    }
   ],
   "source": [
    "# \n",
    "motif_path = Path(config.motif_directory) / f\"{config.motif_database}_{config.species}.meme\"\n",
    "pwms_sub, key_to_tf = load_motif_database(motif_path, final_tfs)\n",
    "fasta_path = genome_dir / f\"{config.genome_assembly}.fa\"\n",
    "df_motif_scores = compute_motif_scores(\n",
    "    bed_file=peaks_bed,\n",
    "    fasta_file=fasta_path,\n",
    "    pwms_sub=pwms_sub,\n",
    "    key_to_tf=key_to_tf,\n",
    "    n_peaks=data_atac.shape[1],\n",
    "    window=500,threshold= config.motif_match_pvalue_threshold\n",
    ")\n",
    "df_motif_scores=df_motif_scores[final_tfs]  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "a09c2d2b-49c7-4de8-b124-f60df687937a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_motif_scores.to_csv(out_dir / \"motif_scores.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da4e68ff",
   "metadata": {},
   "source": [
    "##### 15. compute insilico-chipseq\n",
    "We first subset the previously computed ATAC metacell matrix to selected peaks and use it calculate correlation of TF-peak expression-accesibility. These correlations are thresholded based on an empirically determined cutoff ( from non-motif matched peaks per TF) and then multiplied by motif matching scores from FIMO to obtain insilico-chipseq scores ( adapted from https://www.biorxiv.org/content/10.1101/2022.06.15.496239v1 and DiFFTF https://pubmed.ncbi.nlm.nih.gov/31801079/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d84d122d-6c8b-48d4-9f99-c26fb6948d8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.correlation:Computing in-silico ChIP-seq correlation...\n",
      "Thresholding correlation: 100%|██████████| 300/300 [00:01<00:00, 217.98it/s]\n",
      "INFO:preprocessing_pipeline.correlation:Finished in-silico ChIP-seq computation.\n"
     ]
    }
   ],
   "source": [
    "# 14) Recompute metacells for correlation with selected peaks\n",
    "    #     Or subset existing atac_metacell to the new set of peaks\n",
    "# then compute insilico-chipseq\n",
    "atac_metacell = atac_metacell[:, data_atac.var_names].copy()\n",
    "tf_mask = rna_metacell.var[\"gene_type\"] == \"TF\"\n",
    "rna_matrix = rna_metacell.X[:, tf_mask]  # shape=(n_meta, n_tfs)\n",
    "atac_matrix = atac_metacell.X  # shape=(n_meta, n_peaks)\n",
    "\n",
    "insilico_chipseq_act, insilico_chipseq_rep = compute_in_silico_chipseq(\n",
    "    atac_matrix=atac_matrix,\n",
    "    rna_matrix=rna_matrix,\n",
    "    motif_scores=df_motif_scores,\n",
    "    percentile=config.correlation_percentile,\n",
    "    n_bg=config.n_bg_peaks_for_corr\n",
    ")\n",
    "np.save(out_dir / \"insilico_chipseq_act.npy\", insilico_chipseq_act)\n",
    "np.save(out_dir / \"insilico_chipseq_rep.npy\", insilico_chipseq_rep)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2304d255",
   "metadata": {},
   "outputs": [],
   "source": [
    "##### 16. Compute distance matrix between peaks and genes\n",
    "\n",
    "distance is set to 0 if the peak midpoint is within gene-body or promoter (5kb upstream of TSS by default)\n",
    "distance is -1 if peak-gene pairs on different chromosomes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62a1d1ee-6162-4294-b34d-37bd01956b06",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:preprocessing_pipeline.utils:Starting computation of gene-peak distances...\n",
      "INFO:preprocessing_pipeline.utils:Number of genes: 4000, Number of peaks: 90000\n",
      "100%|██████████| 4000/4000 [05:07<00:00, 13.00it/s]\n",
      "INFO:preprocessing_pipeline.utils:Gene-peak distance matrix computed with shape: (4000, 90000)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "data_atac.var[\"index_int\"] = range(data_atac.shape[1])\n",
    "selected_peak_indices = data_atac.var[\"index_int\"].values\n",
    "\n",
    "# Subset GTF to final genes\n",
    "gene_info = gtf_df[gtf_df.feature == \"gene\"].drop_duplicates(\"gene_name\")\n",
    "gene_info['gene']=gene_info['gene_name'].values\n",
    "gene_info = gene_info.set_index(\"gene_name\")\n",
    "gene_info = gene_info.loc[\n",
    "    data_rna.var_names.intersection(gene_info.index)\n",
    "]\n",
    "\n",
    "\n",
    "gene_info[\"chr\"] = gene_info[\"seqname\"]  # rename col for consistency\n",
    "# Create gene_coordinates_intersect with necessary columns\n",
    "gene_info = gene_info[[\n",
    "    \"chr\", \"start\", \"end\", \"strand\",\"gene\"\n",
    "]].copy()\n",
    "gene_info.columns = [\"chr_gene\", \"start\", \"end\", \"strand\",\"gene\"]\n",
    "\n",
    "dist_matrix = compute_gene_peak_distance_matrix(\n",
    "    data_rna=data_rna,\n",
    "    data_atac=data_atac,\n",
    "    gene_coordinates_intersect=gene_info\n",
    "    \n",
    ")\n",
    "np.save(out_dir / \"gene_peak_distance_raw.npy\", dist_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "f98a3423-4958-456c-ab56-27b7579ffd2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1, 19948529, -1, ..., -1, -1, -1],\n",
       "       [-1, -1, -1, ..., -1, -1, -1],\n",
       "       [-1, -1, -1, ..., -1, -1, -1],\n",
       "       ...,\n",
       "       [-1, -1, -1, ..., -1, -1, -1],\n",
       "       [-1, -1, -1, ..., -1, -1, -1],\n",
       "       [-1, -1, -1, ..., -1, -1, -1]], dtype=object)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dist_matrix # -1 denotes peaks on different chromosome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39b02b61",
   "metadata": {},
   "outputs": [],
   "source": [
    "##### 17. obtaining distance based decay terms to initialise peak-gene matrix for training scDoRI "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "770b2cc7-4862-4bbd-b248-8dd5783ffb99",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \n",
    "dist_matrix[dist_matrix < 0 ]= 1e8\n",
    "dist_matrix = np.exp(-1 * dist_matrix.astype(float) / config.peak_distance_scaling_factor)\n",
    "dist_matrix = np.where(dist_matrix < config.peak_distance_min_cutoff, 0, dist_matrix)\n",
    "np.save(out_dir / \"gene_peak_distance_exp.npy\", dist_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ddbeef1",
   "metadata": {},
   "source": [
    "##### 18. Final Logging, completed preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "f523ca11-d8d4-4ffa-ac0e-28b0fe6cffd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:__main__:=== Pipeline completed successfully ===\n"
     ]
    }
   ],
   "source": [
    "logger.info(\"=== Pipeline completed successfully ===\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0e61211-4daf-47ed-9b37-cebbc66bca34",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}