Source code for preprocessing_pipeline.correlation

import numpy as np
import pandas as pd
import logging
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

logger = logging.getLogger(__name__)


[docs]
def compute_in_silico_chipseq(
    atac_matrix: np.ndarray,
    rna_matrix: np.ndarray,
    motif_scores: pd.DataFrame,
    percentile: float = 99.9,
    n_bg: int = 10000
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute correlation-based in-silico ChIP-seq embeddings, separating activator
    and repressor signals.

    Steps
    -----
    1. Compute the Pearson correlation for each (peak, TF) pair.
    2. Retain only the positive part as an "activator" correlation and the negative part as a "repressor" correlation.
    3. For each TF, estimate significance thresholds for correlations from background peaks (i.e., peaks with minimal motif scores for each TF).
    4. Multiply thresholded correlations by the motif scores to produce final activator and repressor embeddings.
    5. Apply MinMax scaling to each TF separately.

    Parameters
    ----------
    atac_matrix : np.ndarray
        Shape (n_metacells, n_peaks). Matrix of peak accessibility (or insertion counts)
        where each row is a metacell and each column is a peak.
    rna_matrix : np.ndarray
        Shape (n_metacells, n_tfs). Matrix of TF expression where each row is a metacell
        and each column is a TF.
    motif_scores : pd.DataFrame
        Shape (n_peaks, n_tfs). DataFrame containing motif scores for each (peak, TF) pair.
    percentile : float, optional
        The percentile threshold used to determine correlation significance (for activators
        and repressors). Default is 99.9.
    n_bg : int, optional
        Number of peaks per TF used to form the "background" distribution (lowest motif
        scores). Default is 10000.

    Returns
    -------
    insilico_chipseq_act_sig : np.ndarray
        Shape (n_peaks, n_tfs). Final activator scores after thresholding correlations
        and combining with motif scores. Scaled to [0, 1].
    insilico_chipseq_rep_sig : np.ndarray
        Shape (n_peaks, n_tfs). Final repressor scores after thresholding correlations
        and combining with motif scores. Scaled to [-1, 0].

    Notes
    -----
    - Positive correlation is interpreted as activator-like; negative correlation is repressor-like.
    - A background distribution is formed from peaks with the smallest motif scores, then used to find correlation thresholds at the requested percentile.
    - The final arrays are MinMax-scaled:
      * The repressor array is scaled into the range [-1, 0].
      * The activator array is scaled into the range [0, 1].
    """
    logger.info("Computing in-silico ChIP-seq correlation...")

    n_cells, n_peaks = atac_matrix.shape
    _, n_tfs = rna_matrix.shape
    if motif_scores.shape != (n_peaks, n_tfs):
        logger.warning("motif_scores dimension does not match (n_peaks x n_tfs).")

    # Z-score peaks & TF expression
    X = (atac_matrix - atac_matrix.mean(axis=0)) / (atac_matrix.std(axis=0) + 1e-8)
    Y = (rna_matrix - rna_matrix.mean(axis=0)) / (rna_matrix.std(axis=0) + 1e-8)

    # Pearson correlation => (n_peaks x n_tfs)
    pearson_r = (X.T @ Y) / n_cells
    pearson_r = np.nan_to_num(pearson_r)

    pearson_r_act = np.clip(pearson_r, 0, None)   # only positive
    pearson_r_rep = np.clip(pearson_r, None, 0)   # only negative

    pearson_r_act_sig = np.zeros_like(pearson_r_act)
    pearson_r_rep_sig = np.zeros_like(pearson_r_rep)

    tf_list = motif_scores.columns

    # Thresholding
    for t in tqdm(range(n_tfs), desc="Thresholding correlation"):
        tf_name = tf_list[t]
        # Find background peaks with smallest motif score
        scores_t = motif_scores[tf_name].values
        order = np.argsort(scores_t)
        bg_idx = order[:min(n_bg, n_peaks)]  # top n_bg smallest motif peaks

        # Activator significance
        bg_vals_act = pearson_r_act[bg_idx, t]
        cutoff_act = np.percentile(bg_vals_act, percentile)

        # Repressor significance
        bg_vals_rep = pearson_r_rep[bg_idx, t]
        cutoff_rep = np.percentile(bg_vals_rep, 100 - percentile)

        act_vec = pearson_r_act[:, t]
        rep_vec = pearson_r_rep[:, t]

        pearson_r_act_sig[:, t] = np.where(act_vec > cutoff_act, act_vec, 0)
        pearson_r_rep_sig[:, t] = np.where(rep_vec < cutoff_rep, rep_vec, 0)

    # Combine with motif
    insilico_chipseq_act_sig = motif_scores.values * pearson_r_act_sig
    insilico_chipseq_rep_sig = motif_scores.values * pearson_r_rep_sig

    # Scale repressor to [-1, 0]
    scaler = MinMaxScaler(feature_range=(-1, 0))
    insilico_chipseq_rep_sig = scaler.fit_transform(insilico_chipseq_rep_sig)

    # Scale activator to [0, 1]
    scaler = MinMaxScaler(feature_range=(0, 1))
    insilico_chipseq_act_sig = scaler.fit_transform(insilico_chipseq_act_sig)

    logger.info("Finished in-silico ChIP-seq computation.")
    return insilico_chipseq_act_sig, insilico_chipseq_rep_sig