# Source code for BLRun.scribeRunner

import os
import pandas as pd
from pathlib import Path
import numpy as np

def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SCRIBE.

    Creates the ``SCRIBE`` folder under ``RunnerObj.inputDir`` if it is
    missing, then writes one ``ExpressionData<idx>.csv`` /
    ``CellData<idx>.csv`` pair per column of the pseudotime file. These
    per-trajectory files are rewritten on every call. ``GeneData.csv`` is
    only written when it does not already exist.

    :param RunnerObj: object providing ``inputDir`` (a path object
        supporting ``joinpath``) and the file names ``exprData`` and
        ``cellData``, both resolved relative to ``inputDir``.
    '''
    scribeDir = RunnerObj.inputDir.joinpath("SCRIBE")
    if not scribeDir.exists():
        print("Input folder for SCRIBE does not exist, creating input folder...")
        # exist_ok=True tolerates the folder being created by someone else
        # between the existence check above and this call.
        scribeDir.mkdir(exist_ok = True)

    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    for idx, colName in enumerate(PTData.columns):
        # Select cells belonging to each pseudotime trajectory, i.e. the
        # rows that have a non-null value in this column.
        cells = PTData.index[PTData[colName].notnull()]

        exprFile = scribeDir.joinpath("ExpressionData" + str(idx) + ".csv")
        ExpressionData.loc[:, cells].to_csv(exprFile, sep = ',',
                                            header = True, index = True)

        # Scribe expects a column labeled Time.
        ptDF = PTData.loc[cells, [colName]].rename(columns = {colName: 'Time'})
        cellFile = scribeDir.joinpath("CellData" + str(idx) + ".csv")
        ptDF.to_csv(cellFile, sep = ',', header = True, index = True)

    geneFile = scribeDir.joinpath("GeneData.csv")
    if not geneFile.exists():
        # 'gene_short_name' is a required column!!
        # NOTE(review): str.replace removes every 'x_' occurrence in a gene
        # name, not only a leading prefix -- confirm this is intended.
        shortNames = [gene.replace('x_', '') for gene in ExpressionData.index]
        geneDF = pd.DataFrame({'gene_short_name': shortNames},
                              index = ExpressionData.index)
        geneDF.to_csv(geneFile, sep = ',', header = True)
    
def run(RunnerObj):
    '''
    Function to run SCRIBE algorithm.
    To see all the inputs runScribe.R script takes, run:
    docker run scribe:base /bin/sh -c "Rscript runScribe.R -h"

    One docker command is built, printed and executed (via ``os.system``)
    per column of the pseudotime file; the exit status of the command is
    not inspected.

    :param RunnerObj: object providing ``inputDir``, ``cellData`` and a
        ``params`` mapping with the keys ``delay``, ``method``,
        ``lowerDetectionLimit``, ``expressionFamily``, ``log`` and
        ``ignorePT``.
    '''
    cwd = str(Path.cwd())

    # Path of the SCRIBE inputs as seen inside the container, where the
    # current working directory is mounted at /data/. The split assumes
    # str(inputDir) contains the working directory as a prefix; otherwise
    # there is no element [1] and an IndexError is raised.
    inputPath = "data"+str(RunnerObj.inputDir).split(cwd)[1]+"/SCRIBE/"

    # required inputs
    delay = str(RunnerObj.params['delay'])
    method = str(RunnerObj.params['method'])
    low = str(RunnerObj.params['lowerDetectionLimit'])
    fam = str(RunnerObj.params['expressionFamily'])

    # optional inputs (flags, enabled when the parameter stringifies to 'True')
    log = str(RunnerObj.params['log'])
    ignorePT = str(RunnerObj.params['ignorePT'])

    # make output dirs if they do not exist:
    # (assumes str(inputDir) contains "inputs/"; the part after it is
    # mirrored under "outputs/")
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/SCRIBE/"
    os.makedirs(outDir, exist_ok = True)

    # One run per pseudotime column (trajectory)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)
    nTrajectories = len(PTData.columns)

    # Pieces of the command that do not depend on the trajectory index
    dockerPrefix = cwd+':/data/ scribe:base /bin/sh -c "time -v -o'
    optionalFlags = ''
    if log == 'True':
        optionalFlags += ' --log'
    if ignorePT == 'True':
        optionalFlags += ' -i'

    for idx in range(nTrajectories):
        # Specify file names for inputs and outputs
        exprName = "ExpressionData"+str(idx)+".csv"
        cellName = "CellData"+str(idx)+".csv"
        outFile = "outFile"+str(idx)+".csv"
        timeFile = 'time'+str(idx)+".txt"

        # Build the command to run Scribe
        cmdToRun = ' '.join(['docker run --rm -v', dockerPrefix,
                             "data/" + outDir + timeFile, 'Rscript runScribe.R',
                             '-e', inputPath + exprName, '-c', inputPath + cellName,
                             '-g', inputPath + 'GeneData.csv', '-o data/'+outDir,
                             '-d', delay, '-l', low,
                             '-m', method, '-x', fam, '--outFile '+outFile])

        # Close the double quote opened after "/bin/sh -c"
        cmdToRun += optionalFlags + '"'

        print(cmdToRun)

        os.system(cmdToRun)



def parseOutput(RunnerObj):
    '''
    Function to parse outputs from SCRIBE.

    Reads ``outFile<idx>.csv`` for every column of the pseudotime file,
    keeps the maximum edge weight observed for each (Gene1, Gene2) pair
    and writes the edges, sorted by decreasing weight, to
    ``rankedEdges.csv`` in the SCRIBE output folder. If any of the
    expected output files is missing, a message is printed and nothing
    is written.

    :param RunnerObj: object providing ``inputDir`` (its string form must
        contain ``inputs/``) and ``cellData``.
    '''
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/SCRIBE/"

    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    OutSubDF = []
    for idx in range(len(PTData.columns)):
        # Read output
        outFile = outDir+'outFile'+str(idx)+'.csv'
        if not Path(outFile).exists():
            # Quit if output file does not exist
            print(outFile+' does not exist, skipping...')
            return
        OutSubDF.append(pd.read_csv(outFile, sep = ' ', header = None))

    # Merge the dataframes by taking the maximum value from each DF.
    # ignore_index gives the stacked frame a unique index, so the boolean
    # mask below is not applied over repeated labels.
    outDF = pd.concat(OutSubDF, ignore_index = True)
    outDF.columns = ['Gene1','Gene2','EdgeWeight']
    # Group by rows code is from here:
    # https://stackoverflow.com/questions/53114609/pandas-how-to-remove-duplicate-rows-but-keep-all-rows-with-max-value
    maxPerEdge = outDF.groupby(['Gene1','Gene2'])['EdgeWeight'].transform('max')
    res = outDF[outDF['EdgeWeight'] == maxPerEdge]
    # The filter keeps every row tied for an edge's maximum (e.g. the same
    # weight in two trajectories); report each edge only once.
    res = res.drop_duplicates(subset = ['Gene1','Gene2'])
    # Sort values in the dataframe
    finalDF = res.sort_values('EdgeWeight', ascending = False)

    finalDF.to_csv(outDir+'rankedEdges.csv', sep = '\t', index = False)