Preprocessing Notebook#
[53]:
import logging
from pathlib import Path
import numpy as np
import pandas as pd
# 1) Import config & internal modules
from preprocessing_pipeline import config
from preprocessing_pipeline.download import (
download_genome_references
)
from preprocessing_pipeline.data_io import (
create_dir_if_not_exists,
load_anndata,
save_processed_datasets
)
from preprocessing_pipeline.filtering import (
intersect_cells,
remove_mitochondrial_genes
)
from preprocessing_pipeline.gene_selection import (
load_gtf,
filter_protein_coding_genes,
compute_hvgs_and_tfs
)
from preprocessing_pipeline.peak_selection import keep_promoters_and_select_hv_peaks
from preprocessing_pipeline.metacells import create_metacells
from preprocessing_pipeline.motif_scanning import (
run_bedtools_intersect,
load_motif_database,
compute_motif_scores
)
from preprocessing_pipeline.correlation import compute_in_silico_chipseq
from preprocessing_pipeline.utils import create_extended_gene_bed,compute_gene_peak_distance_matrix
logger = logging.getLogger(__name__)
[54]:
from importlib import reload
import sys
for module_name in list(sys.modules.keys()):
if module_name.startswith("preprocessing_pipeline"):
reload(sys.modules[module_name])
[3]:
logging.getLogger().setLevel(config.logging_level)
logger.info("=== Starting multi-ome preprocessing pipeline ===")
INFO:__main__:=== Starting multi-ome preprocessing pipeline ===
1. Prepare directories#
[ ]:
data_dir = Path(config.data_dir)
genome_dir = Path(config.genome_dir)
motif_dir = Path(config.motif_directory)
out_dir = data_dir / config.output_subdir_name
create_dir_if_not_exists(genome_dir)
create_dir_if_not_exists(motif_dir)
create_dir_if_not_exists(out_dir)
2. Download reference genome, gene annotations and chromosome sizes#
[ ]:
download_genome_references(
genome_dir=genome_dir,
species=config.species,
assembly=config.genome_assembly,
gtf_url=config.gtf_url,
chrom_sizes_url=config.chrom_sizes_url,
fasta_url=config.fasta_url
)
#download_motif_database(motif_dir, config.motif_database, config.species)
INFO:preprocessing_pipeline.download:Using genome references for species='mouse', assembly='mm10'.
GTF: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M18/gencode.vM18.basic.annotation.gtf.gz
Chrom.sizes: https://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes
FASTA: https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz
INFO:preprocessing_pipeline.download:Reference files are ready in /data/saraswat/new_metacells/mouse_genome_files
3. Load RNA and ATAC anndata#
[ ]:
data_rna, data_atac = load_anndata(
data_dir,
config.rna_adata_file_name,
config.atac_adata_file_name
)
INFO:preprocessing_pipeline.data_io:Loading RNA from /data/saraswat/new_metacells/data_gastrulation_single_cell/anndata.h5ad, ATAC from /data/saraswat/new_metacells/data_gastrulation_single_cell/PeakMatrix_anndata.h5ad
4. Find cells common to both modalities and remove mito genes#
[ ]:
data_rna, data_atac = intersect_cells(data_rna, data_atac)
data_rna = remove_mitochondrial_genes(data_rna, mito_prefix=config.mitochondrial_prefix)
INFO:preprocessing_pipeline.filtering:Intersected cells: now RNA=(56861, 32285), ATAC=(56861, 192251)
INFO:preprocessing_pipeline.filtering:Removed 13 mitochondrial genes with prefix=mt-
5. Optionally filter anndata to protein-coding genes#
[ ]:
#
gtf_file = genome_dir / "annotation.gtf"
gtf_df = load_gtf(gtf_file)
data_rna = filter_protein_coding_genes(data_rna, gtf_df)
INFO:preprocessing_pipeline.gene_selection:Loading GTF from /data/saraswat/new_metacells/mouse_genome_files/annotation.gtf
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'protein_id', 'ccdsid', 'ont']
INFO:preprocessing_pipeline.gene_selection:Filtered to protein-coding genes: 19374 genes left.
6. Selecting highly variable genes and TFs#
First we find which TFs have a motif present in the database provided. User provided sets of genes and TFs are included in final list by default; highly variable computations are performed to obtain remaining genes and TFs
[ ]:
motif_path = motif_dir / f"{config.motif_database}_{config.species}.meme"
tf_names_all = []
with open(motif_path, "r") as f:
for line in f:
if line.startswith("MOTIF"):
parts = line.strip().split()
if len(parts) >= 3:
tf_name = parts[2].split("_")[0].strip("()").strip()
tf_names_all.append(tf_name)
tf_names_all = sorted(list(set(tf_names_all)))
data_rna, final_genes, final_tfs = compute_hvgs_and_tfs(
data_rna=data_rna,
tf_names=tf_names_all,
user_genes=config.genes_user,
user_tfs=config.tfs_user,
num_genes=config.num_genes,
num_tfs=config.num_tfs,
min_cells=config.min_cells_per_gene
)
INFO:preprocessing_pipeline.gene_selection:Selecting HVGs and TFs...
INFO:preprocessing_pipeline.gene_selection:Selected 3700 HVGs + 300 TFs.
7. Create extended gene bed file#
Here we extend the gene body to the user defined genomic window for processing later
[ ]:
chrom_sizes_path = genome_dir / f"{config.genome_assembly}.chrom.sizes"
extended_genes_bed_df = create_extended_gene_bed(
gtf_df,
final_genes + final_tfs, # if we want to include TF genes too
window_size=config.window_size,
chrom_sizes_path=chrom_sizes_path
)
gene_bed_file = out_dir / f"genes_extended_{config.window_size//1000}kb.bed"
extended_genes_bed_df.to_csv(gene_bed_file, sep="\t", header=False, index=False)
logger.info(f"Created extended gene bed => {gene_bed_file}")
INFO:__main__:Created extended gene bed => /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/genes_extended_80kb.bed
[11]:
data_atac.var
[11]:
idx | |
---|---|
chr1:3035602-3036202 | 1 |
chr1:3062653-3063253 | 2 |
chr1:3072313-3072913 | 3 |
chr1:3191496-3192096 | 4 |
chr1:3340575-3341175 | 5 |
... | ... |
chrX:169902806-169903406 | 3284 |
chrX:169905921-169906521 | 3285 |
chrX:169915616-169916216 | 3286 |
chrX:169925487-169926087 | 3287 |
chrX:169937064-169937664 | 3288 |
192251 rows × 1 columns
8. Create bed file for all peaks#
Of note, that peak anndata var should have chr, start, end and peak_name columns. If not, obtain them
[ ]:
data_atac.var["chr"] = [x.split(":")[0] for x in data_atac.var.index]
data_atac.var["start"] = [int(x.split(":")[1].split("-")[0]) for x in data_atac.var.index]
data_atac.var["end"] = [int(x.split(":")[1].split("-")[1]) for x in data_atac.var.index]
data_atac.var["peak_name"] = data_atac.var.index
all_peaks_bed = out_dir / "peaks_all.bed"
data_atac.var[["chr","start","end","peak_name"]].to_csv(all_peaks_bed, sep="\t", header=False, index=False)
9. intersect peaks with extended gene window#
Here we subset to peaks which are within a user-defined genomic window of atleast one (selected) gene.
[ ]:
#
intersected_bed = out_dir / "peaks_intersected.bed"
run_bedtools_intersect(a_bed=all_peaks_bed, b_bed=gene_bed_file, out_bed=intersected_bed)
peaks_intersected = pd.read_csv(intersected_bed, sep="\t", header=None)
peaks_intersected.columns = ["chr","start","end","peak_name"]
windowed_set = set(peaks_intersected["peak_name"])
# Subset data_atac to these peaks
data_atac = data_atac[:, list(windowed_set)].copy()
logger.info(f"After gene-window filtering => shape={data_atac.shape}")
INFO:preprocessing_pipeline.motif_scanning:Running: bedtools intersect -u -wa -a /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/peaks_all.bed -b /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/genes_extended_80kb.bed > /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/peaks_intersected.bed
INFO:__main__:After gene-window filtering => shape=(56861, 98566)
10. Create metacells and store in .obs[“leiden”]#
Here we obtain metacells using fine-grained leiden clustering on RNA modality. These metacells ar eused to calculate highly variable peaks and to calculate insilico-chipseq scores.
[ ]:
#
rna_metacell, atac_metacell = create_metacells(
data_rna, data_atac,
grouping_key="leiden",
resolution=config.leiden_resolution,
batch_key=config.batch_key
)
# Copy labels
data_atac.obs["leiden"] = data_rna.obs["leiden"]
INFO:preprocessing_pipeline.metacells:Creating metacells with resolution=10 (grouping key=leiden).
2025-03-14 20:19:19,432 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
INFO:harmonypy:Computing initial centroids with sklearn.KMeans...
2025-03-14 20:19:20,834 - harmonypy - INFO - sklearn.KMeans initialization complete.
INFO:harmonypy:sklearn.KMeans initialization complete.
2025-03-14 20:19:21,003 - harmonypy - INFO - Iteration 1 of 10
INFO:harmonypy:Iteration 1 of 10
2025-03-14 20:19:27,921 - harmonypy - INFO - Iteration 2 of 10
INFO:harmonypy:Iteration 2 of 10
2025-03-14 20:19:34,804 - harmonypy - INFO - Iteration 3 of 10
INFO:harmonypy:Iteration 3 of 10
2025-03-14 20:19:41,695 - harmonypy - INFO - Iteration 4 of 10
INFO:harmonypy:Iteration 4 of 10
2025-03-14 20:19:48,586 - harmonypy - INFO - Iteration 5 of 10
INFO:harmonypy:Iteration 5 of 10
2025-03-14 20:19:55,489 - harmonypy - INFO - Iteration 6 of 10
INFO:harmonypy:Iteration 6 of 10
2025-03-14 20:20:02,383 - harmonypy - INFO - Iteration 7 of 10
INFO:harmonypy:Iteration 7 of 10
2025-03-14 20:20:09,297 - harmonypy - INFO - Converged after 7 iterations
INFO:harmonypy:Converged after 7 iterations
/data/saraswat/new_metacells/scdori_pip/preprocessing_pipeline/metacells.py:50: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.
To achieve the future defaults please pass: flavor="igraph" and n_iterations=2. directed must also be False to work with igraph's implementation.
sc.tl.leiden(data_rna, resolution=resolution, key_added=grouping_key)
/data/saraswat/new_metacells/scdori_pip/preprocessing_pipeline/metacells.py:54: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
cluster_groups = data_rna.obs.groupby(grouping_key)
INFO:preprocessing_pipeline.metacells:Metacell shapes: RNA=(140, 4000), ATAC=(140, 98566)
11. Keep promoter peaks and highly variable peaks from the rest => total # = num_peaks#
[ ]:
#
data_atac = keep_promoters_and_select_hv_peaks(
data_atac=data_atac,
total_n_peaks=config.num_peaks,
cluster_key="leiden",
promoter_col=config.promoter_col # column in data_atac.var
)
logger.info(f"Final shape after combining promoters + HV => {data_atac.shape}")
WARNING:preprocessing_pipeline.peak_selection:Column promoter_col not found in data_atac.var; no special promoter logic.
/data/saraswat/new_metacells/scdori_pip/preprocessing_pipeline/peak_selection.py:22: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
cluster_groups = data_atac.obs.groupby(cluster_key)
INFO:preprocessing_pipeline.peak_selection:Selected top 90000 variable peaks (by std across leiden).
INFO:__main__:Final shape after combining promoters + HV => (56861, 90000)
12. Save processed ATAC and RNA data#
[ ]:
save_processed_datasets(data_rna, data_atac, out_dir)
... storing 'gene_type' as categorical
... storing 'chr' as categorical
INFO:preprocessing_pipeline.data_io:Saved processed RNA to /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/rna_processed.h5ad with shape=(56861, 4000)
INFO:preprocessing_pipeline.data_io:Saved processed ATAC to /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/atac_processed.h5ad with shape=(56861, 90000)
13. Make bed file for final set of peaks (post selection)#
[ ]:
#
data_atac.var["chr"] = [v.split(":")[0] for v in data_atac.var_names]
data_atac.var["start"] = [int(v.split(":")[1].split("-")[0]) for v in data_atac.var_names]
data_atac.var["end"] = [int(v.split(":")[1].split("-")[1]) for v in data_atac.var_names]
data_atac.var["peak_name"] = data_atac.var_names
peaks_bed = out_dir / "peaks_selected.bed"
data_atac.var[["chr","start","end","peak_name"]].to_csv(
peaks_bed, sep="\t", header=False, index=False
)
14. Compute motif matches for peaks#
We use FIMO module from tangermeme (https://tangermeme.readthedocs.io/en/latest/tutorials/Tutorial_D1_FIMO.html) to score the motifs
[ ]:
#
motif_path = Path(config.motif_directory) / f"{config.motif_database}_{config.species}.meme"
pwms_sub, key_to_tf = load_motif_database(motif_path, final_tfs)
fasta_path = genome_dir / f"{config.genome_assembly}.fa"
df_motif_scores = compute_motif_scores(
bed_file=peaks_bed,
fasta_file=fasta_path,
pwms_sub=pwms_sub,
key_to_tf=key_to_tf,
n_peaks=data_atac.shape[1],
window=500,threshold= config.motif_match_pvalue_threshold
)
df_motif_scores=df_motif_scores[final_tfs]
INFO:preprocessing_pipeline.motif_scanning:Reading motif file: /data/saraswat/new_metacells/motif_database/cisbp_mouse.meme
INFO:preprocessing_pipeline.motif_scanning:Subselected 300 motifs for 300 TFs.
INFO:preprocessing_pipeline.motif_scanning:Computing motif scores for /data/saraswat/new_metacells/data_gastrulation_single_cell/generated/peaks_selected.bed (n_peaks=90000) with window=500
100%|██████████| 300/300 [00:05<00:00, 57.26it/s]
INFO:preprocessing_pipeline.motif_scanning:Finished computing motif scores: (90000, 300)
[58]:
df_motif_scores.to_csv(out_dir / "motif_scores.tsv", sep="\t")
15. compute insilico-chipseq#
We first subset the previously computed ATAC metacell matrix to selected peaks and use it calculate correlation of TF-peak expression-accesibility. These correlations are thresholded based on an empirically determined cutoff ( from non-motif matched peaks per TF) and then multiplied by motif matching scores from FIMO to obtain insilico-chipseq scores ( adapted from https://www.biorxiv.org/content/10.1101/2022.06.15.496239v1 and DiFFTF https://pubmed.ncbi.nlm.nih.gov/31801079/)
[28]:
# 14) Recompute metacells for correlation with selected peaks
# Or subset existing atac_metacell to the new set of peaks
# then compute insilico-chipseq
atac_metacell = atac_metacell[:, data_atac.var_names].copy()
tf_mask = rna_metacell.var["gene_type"] == "TF"
rna_matrix = rna_metacell.X[:, tf_mask] # shape=(n_meta, n_tfs)
atac_matrix = atac_metacell.X # shape=(n_meta, n_peaks)
insilico_chipseq_act, insilico_chipseq_rep = compute_in_silico_chipseq(
atac_matrix=atac_matrix,
rna_matrix=rna_matrix,
motif_scores=df_motif_scores,
percentile=config.correlation_percentile,
n_bg=config.n_bg_peaks_for_corr
)
np.save(out_dir / "insilico_chipseq_act.npy", insilico_chipseq_act)
np.save(out_dir / "insilico_chipseq_rep.npy", insilico_chipseq_rep)
INFO:preprocessing_pipeline.correlation:Computing in-silico ChIP-seq correlation...
Thresholding correlation: 100%|██████████| 300/300 [00:01<00:00, 217.98it/s]
INFO:preprocessing_pipeline.correlation:Finished in-silico ChIP-seq computation.
[ ]:
##### 16. Compute distance matrix between peaks and genes
distance is set to 0 if the peak midpoint is within gene-body or promoter (5kb upstream of TSS by default)
distance is -1 if peak-gene pairs on different chromosomes
[ ]:
data_atac.var["index_int"] = range(data_atac.shape[1])
selected_peak_indices = data_atac.var["index_int"].values
# Subset GTF to final genes
gene_info = gtf_df[gtf_df.feature == "gene"].drop_duplicates("gene_name")
gene_info['gene']=gene_info['gene_name'].values
gene_info = gene_info.set_index("gene_name")
gene_info = gene_info.loc[
data_rna.var_names.intersection(gene_info.index)
]
gene_info["chr"] = gene_info["seqname"] # rename col for consistency
# Create gene_coordinates_intersect with necessary columns
gene_info = gene_info[[
"chr", "start", "end", "strand","gene"
]].copy()
gene_info.columns = ["chr_gene", "start", "end", "strand","gene"]
dist_matrix = compute_gene_peak_distance_matrix(
data_rna=data_rna,
data_atac=data_atac,
gene_coordinates_intersect=gene_info
)
np.save(out_dir / "gene_peak_distance_raw.npy", dist_matrix)
INFO:preprocessing_pipeline.utils:Starting computation of gene-peak distances...
INFO:preprocessing_pipeline.utils:Number of genes: 4000, Number of peaks: 90000
100%|██████████| 4000/4000 [05:07<00:00, 13.00it/s]
INFO:preprocessing_pipeline.utils:Gene-peak distance matrix computed with shape: (4000, 90000)
[31]:
dist_matrix # -1 denotes peaks on different chromosome
[31]:
array([[-1, 19948529, -1, ..., -1, -1, -1],
[-1, -1, -1, ..., -1, -1, -1],
[-1, -1, -1, ..., -1, -1, -1],
...,
[-1, -1, -1, ..., -1, -1, -1],
[-1, -1, -1, ..., -1, -1, -1],
[-1, -1, -1, ..., -1, -1, -1]], dtype=object)
[ ]:
##### 17. obtaining distance based decay terms to initialise peak-gene matrix for training scDoRI
[ ]:
#
dist_matrix[dist_matrix < 0 ]= 1e8
dist_matrix = np.exp(-1 * dist_matrix.astype(float) / config.peak_distance_scaling_factor)
dist_matrix = np.where(dist_matrix < config.peak_distance_min_cutoff, 0, dist_matrix)
np.save(out_dir / "gene_peak_distance_exp.npy", dist_matrix)
18. Final Logging, completed preprocessing#
[33]:
logger.info("=== Pipeline completed successfully ===")
INFO:__main__:=== Pipeline completed successfully ===
[ ]: