Source code for preprocessing_pipeline.data_io

import scanpy as sc
import anndata as ad
from pathlib import Path
import logging

# Module-level logger named after this module; the I/O helpers below report
# the paths they touch through it.
logger = logging.getLogger(__name__)

def create_dir_if_not_exists(directory: Path) -> None:
    """
    Make sure ``directory`` is present on disk.

    Parameters
    ----------
    directory : pathlib.Path
        Location of the directory to create.

    Returns
    -------
    None
        Returns immediately when the path already exists. Otherwise the
        creation is logged and the directory is made together with any
        missing parent folders.
    """
    # Guard clause: nothing to do (and nothing to log) for an existing path.
    if directory.exists():
        return

    logger.info(f"Creating directory: {directory}")
    directory.mkdir(parents=True, exist_ok=True)
def load_anndata(
    data_dir: Path, rna_file: str, atac_file: str
) -> tuple[ad.AnnData, ad.AnnData]:
    """
    Read the RNA and ATAC `.h5ad` files from one directory.

    Parameters
    ----------
    data_dir : pathlib.Path
        Directory that holds both `.h5ad` files.
    rna_file : str
        File name of the RNA dataset inside `data_dir`.
    atac_file : str
        File name of the ATAC dataset inside `data_dir`.

    Returns
    -------
    (AnnData, AnnData)
        The RNA object first, then the ATAC object.

    Notes
    -----
    Both file names are resolved relative to `data_dir`, and the resulting
    paths are logged before anything is read.
    """
    # Resolve both locations up front so the log line names them before any
    # read is attempted.
    rna_source, atac_source = (
        data_dir / file_name for file_name in (rna_file, atac_file)
    )
    logger.info(f"Loading RNA from {rna_source}, ATAC from {atac_source}")

    # Tuple elements are evaluated left to right: RNA is read before ATAC.
    return sc.read_h5ad(rna_source), sc.read_h5ad(atac_source)
def save_processed_datasets(
    data_rna: ad.AnnData, data_atac: ad.AnnData, out_dir: Path
) -> None:
    """
    Save processed RNA and ATAC AnnData objects with matching cell order.

    Parameters
    ----------
    data_rna : anndata.AnnData
        The RNA AnnData object, potentially subset or processed.
    data_atac : anndata.AnnData
        The ATAC AnnData object, potentially subset or processed.
    out_dir : pathlib.Path
        The directory where the processed `.h5ad` files will be saved. It is
        created (with parents) when it does not exist yet.

    Returns
    -------
    None
        Writes two files, `rna_processed.h5ad` and `atac_processed.h5ad`,
        both restricted to the same set and order of cells.

    Notes
    -----
    - The cell indices (`obs_names`) of `data_rna` and `data_atac` are
      intersected and only the common cells are kept. The caller's objects
      are not modified; the subsets are copies.
    - A warning is logged when the two datasets share no cells; the (empty)
      datasets are still written.
    - The final shapes of the saved AnnData objects are logged.
    """
    # Index both objects with the same label sequence so the saved rows line
    # up one-to-one between the two files.
    common_cells = data_rna.obs_names.intersection(data_atac.obs_names)
    if common_cells.empty:
        logger.warning(
            "RNA and ATAC datasets share no cells; the saved files will be empty."
        )
    data_rna = data_rna[common_cells].copy()
    data_atac = data_atac[common_cells].copy()

    # Make sure the target directory is there before writing, so saving does
    # not depend on the caller having prepared it.
    create_dir_if_not_exists(out_dir)

    rna_path = out_dir / "rna_processed.h5ad"
    atac_path = out_dir / "atac_processed.h5ad"
    data_rna.write_h5ad(rna_path)
    data_atac.write_h5ad(atac_path)

    logger.info(f"Saved processed RNA to {rna_path} with shape={data_rna.shape}")
    logger.info(f"Saved processed ATAC to {atac_path} with shape={data_atac.shape}")