Source code for preprocessing_pipeline.filtering
# filtering.py
import anndata as ad
import logging
logger = logging.getLogger(__name__)
[docs]
def intersect_cells(
data_rna: ad.AnnData,
data_atac: ad.AnnData
) -> tuple[ad.AnnData, ad.AnnData]:
"""
Keep only the cells that are present in both RNA and ATAC datasets.
Parameters
----------
data_rna : anndata.AnnData
The RNA single-cell data.
data_atac : anndata.AnnData
The ATAC single-cell data.
Returns
-------
(data_rna_sub, data_atac_sub) : tuple of anndata.AnnData
Two AnnData objects that share the same set of cells, in the same order.
Notes
-----
- The function finds the intersection of `obs_names` (cell barcodes) between
the two AnnData objects.
- It logs the new shapes of the intersected RNA and ATAC data.
"""
common_idx = data_rna.obs_names.intersection(data_atac.obs_names)
data_rna_sub = data_rna[common_idx].copy()
data_atac_sub = data_atac[common_idx].copy()
logger.info(f"Intersected cells: now RNA={data_rna_sub.shape}, ATAC={data_atac_sub.shape}")
return data_rna_sub, data_atac_sub
[docs]
def remove_mitochondrial_genes(
data_rna: ad.AnnData,
mito_prefix: str = "mt-"
) -> ad.AnnData:
"""
Remove mitochondrial genes from the RNA data based on a gene name prefix.
Parameters
----------
data_rna : anndata.AnnData
The RNA AnnData containing gene expression counts.
mito_prefix : str, optional
The prefix used to identify mitochondrial genes. Default is "mt-".
Returns
-------
anndata.AnnData
A new AnnData object without mitochondrial genes. The original data
is not modified in-place.
Notes
-----
- Mitochondrial genes are identified by checking if gene names start with
`mito_prefix` (case-insensitive).
- A boolean column "mt" is added to `data_rna.var` to indicate whether
each gene was marked as mitochondrial. This subset is then removed
from the data.
- Logs the number of removed genes.
"""
data_rna.var["mt"] = [gene.lower().startswith(mito_prefix) for gene in data_rna.var_names]
keep_mask = ~data_rna.var["mt"]
data_rna_sub = data_rna[:, keep_mask].copy()
dropped = data_rna.shape[1] - data_rna_sub.shape[1]
logger.info(f"Removed {dropped} mitochondrial genes with prefix={mito_prefix}")
return data_rna_sub