Source code for preprocessing_pipeline.data_io
import scanpy as sc
import anndata as ad
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
[docs]
def create_dir_if_not_exists(directory: Path) -> None:
"""
Create a directory (and parent directories) if it does not already exist.
Parameters
----------
directory : pathlib.Path
The directory path to create.
Returns
-------
None
If the directory already exists, does nothing. Otherwise, it is created
along with any necessary parent folders.
"""
if not directory.exists():
logger.info(f"Creating directory: {directory}")
directory.mkdir(parents=True, exist_ok=True)
[docs]
def load_anndata(
data_dir: Path,
rna_file: str,
atac_file: str
) -> tuple[ad.AnnData, ad.AnnData]:
"""
Load RNA and ATAC data from disk into AnnData objects.
Parameters
----------
data_dir : pathlib.Path
Base directory containing the `.h5ad` files.
rna_file : str
The filename for the RNA data (H5AD).
atac_file : str
The filename for the ATAC data (H5AD).
Returns
-------
(AnnData, AnnData)
A tuple containing:
- data_rna: AnnData
The RNA AnnData object.
- data_atac: AnnData
The ATAC AnnData object.
Notes
-----
Both `.h5ad` files are expected to be located in `data_dir`.
This function logs the path from which each file is loaded.
"""
rna_path = data_dir / rna_file
atac_path = data_dir / atac_file
logger.info(f"Loading RNA from {rna_path}, ATAC from {atac_path}")
data_rna = sc.read_h5ad(rna_path)
data_atac = sc.read_h5ad(atac_path)
return data_rna, data_atac
[docs]
def save_processed_datasets(
data_rna: ad.AnnData,
data_atac: ad.AnnData,
out_dir: Path
) -> None:
"""
Save processed RNA and ATAC AnnData objects with matching cell order.
Parameters
----------
data_rna : anndata.AnnData
The RNA AnnData object, potentially subset or processed.
data_atac : anndata.AnnData
The ATAC AnnData object, potentially subset or processed.
out_dir : pathlib.Path
The directory where the processed `.h5ad` files will be saved.
Returns
-------
None
Writes two files, `rna_processed.h5ad` and `atac_processed.h5ad`,
ensuring both have the same set and order of cells.
Notes
-----
- This function intersects the cell indices (obs_names) of `data_rna` and `data_atac` to keep only the common cells.
- The final shapes of the saved AnnData objects are logged.
"""
# Ensure same cell order
common_cells = data_rna.obs_names.intersection(data_atac.obs_names)
data_rna = data_rna[common_cells].copy()
data_atac = data_atac[common_cells].copy()
# Save
rna_path = out_dir / "rna_processed.h5ad"
atac_path = out_dir / "atac_processed.h5ad"
data_rna.write_h5ad(rna_path)
data_atac.write_h5ad(atac_path)
logger.info(f"Saved processed RNA to {rna_path} with shape={data_rna.shape}")
logger.info(f"Saved processed ATAC to {atac_path} with shape={data_atac.shape}")