# Source code for BLRun.pearsonRunner

import pandas as pd

from BLRun.runner import Runner


class PearsonRunner(Runner):
    """Concrete runner for pairwise Pearson correlation GRN inference.

    Runs entirely within the BEELINE conda environment; no Docker image is used.
    The image field in the config should be set to 'local'.
    """

    def generateInputs(self) -> None:
        '''
        Verifies that the expression data file exists in the input directory.
        No file copying is required because Pearson runs locally without Docker.

        :param self.input_dir: Path — directory containing input files
        :param self.exprData: str — expression data filename
        :raises FileNotFoundError: if the expression data file is missing
        '''
        expr_path = self.input_dir / self.exprData
        if not expr_path.exists():
            raise FileNotFoundError(
                f"Expression data file not found: {expr_path}")

    def run(self) -> None:
        '''
        Computes pairwise Pearson correlation between all gene pairs.
        Each gene's expression is first scaled by the magnitude of its maximum
        expression value across all cells, bringing values into the range [0, 1]
        for non-negative data.
        Genes with zero maximum expression are left unnormalized (divisor replaced with 1).
        Writes the full (genes x genes) correlation matrix to working_dir/outFile.txt.

        :param self.input_dir: Path — directory containing expression data
        :param self.exprData: str — CSV filename; rows = genes, columns = cells
        :param self.working_dir: Path — output location for outFile.txt
            (created if it does not exist yet)
        :output working_dir/outFile.txt: tab-separated (genes x genes) correlation matrix
        :raises TypeError: if the parsed expression data is not a DataFrame
        '''
        # Read expression data: rows = genes, columns = cells
        ExpressionData = pd.read_csv(
            self.input_dir / self.exprData, header=0, index_col=0)
        if not isinstance(ExpressionData, pd.DataFrame):
            raise TypeError(f"ExpressionData must be a DataFrame, got {type(ExpressionData)}")

        # Scale each gene (row) by the magnitude of its maximum expression value.
        # The absolute value matters: Pearson r is unchanged by a positive scale
        # factor but flips sign under a negative one, so dividing by a negative
        # maximum (possible for log-transformed or centred data) would invert
        # every correlation involving that gene.
        # max=0 is replaced with 1 to avoid division by zero for silent genes.
        scale_per_gene = ExpressionData.max(axis=1).abs().replace(0, 1)
        normalized = ExpressionData.div(scale_per_gene, axis=0)

        # Transpose to (cells x genes) so .corr() produces a (genes x genes) matrix.
        corr = normalized.T.corr(method='pearson')

        # Make sure the destination exists so to_csv cannot fail on a fresh run.
        self.working_dir.mkdir(parents=True, exist_ok=True)
        corr.to_csv(self.working_dir / 'outFile.txt', sep='\t')

    def parseOutput(self) -> None:
        '''
        Reads the gene x gene correlation matrix from working_dir/outFile.txt and
        writes a ranked edge list to output_dir/rankedEdges.csv.
        Both directions of each gene pair are included (the matrix is symmetric).
        Self-correlations (Gene1 == Gene2) are excluded.
        Undefined (NaN) correlations, e.g. from constant genes, are excluded.
        Edges are ranked by absolute correlation value, descending; ties keep
        their row-major matrix order so the ranking is deterministic.
        If outFile.txt is missing, a message is printed and nothing is written.

        :param self.working_dir: Path — directory containing outFile.txt
        :output output_dir/rankedEdges.csv: tab-separated edge list with columns
            Gene1 (str), Gene2 (str), EdgeWeight (float, signed Pearson r)
        :raises TypeError: if the parsed correlation matrix is not a DataFrame
        '''
        outFile = self.working_dir / 'outFile.txt'
        if not outFile.exists():
            print(str(outFile) + ' does not exist, skipping...')
            return

        # Read square correlation matrix (genes x genes)
        CorrDF = pd.read_csv(outFile, sep='\t', header=0, index_col=0)
        if not isinstance(CorrDF, pd.DataFrame):
            raise TypeError(f"CorrDF must be a DataFrame, got {type(CorrDF)}")

        # read_csv infers an integer index for all-numeric gene IDs while the
        # header labels stay strings; normalise both axes to str so that the
        # Gene1 != Gene2 comparison below really removes the diagonal.
        CorrDF.index = CorrDF.index.astype(str)
        CorrDF.columns = CorrDF.columns.astype(str)

        # Convert to long-format edge list
        stacked = CorrDF.stack().reset_index()
        stacked.columns = ['Gene1', 'Gene2', 'EdgeWeight']

        # Drop undefined correlations explicitly: whether stack() discards NaN
        # depends on the pandas version, and NaN weights cannot be ranked.
        stacked = stacked.dropna(subset=['EdgeWeight'])

        # Drop self-correlations
        OutDF = stacked[stacked['Gene1'] != stacked['Gene2']].copy()

        # Rank by absolute correlation value, descending; retain signed EdgeWeight.
        # A stable sort keeps tied edges (every symmetric pair ties) in a
        # reproducible order instead of whatever an unstable argsort yields.
        ranked_index = OutDF['EdgeWeight'].abs().sort_values(
            ascending=False, kind='mergesort').index
        OutDF = OutDF.loc[ranked_index]

        self._write_ranked_edges(OutDF[['Gene1', 'Gene2', 'EdgeWeight']])