# Source code for BLRun.jump3Runner

import os
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn import preprocessing

def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for JUMP3.

    Reads the expression matrix ``RunnerObj.exprData`` and the cell metadata
    ``RunnerObj.cellData`` from ``RunnerObj.inputDir``, transposes the
    expression matrix (each original column becomes a row), appends a
    ``Time`` and an ``Experiment`` column, and writes the result to
    ``<inputDir>/JUMP3/ExpressionData.csv`` without the row index.

    If ``<inputDir>/JUMP3/ExpressionData.csv`` already exists,
    this function will not do anything.

    :param RunnerObj: object providing ``inputDir`` (must support
        ``joinpath``, e.g. a ``pathlib.Path``), ``exprData`` and
        ``cellData`` (file names relative to ``inputDir``).
    '''
    jump3Dir = RunnerObj.inputDir.joinpath("JUMP3")
    if not jump3Dir.exists():
        print("Input folder for JUMP3 does not exist, creating input folder...")
        # exist_ok=True: do not crash if the folder appears between the
        # existence check above and this call.
        jump3Dir.mkdir(exist_ok = True)

    jump3Input = jump3Dir.joinpath("ExpressionData.csv")
    if not jump3Input.exists():
        ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                     header = 0, index_col = 0)
        newExpressionData = ExpressionData.T.copy()
        PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                             header = 0, index_col = 0)
        # make sure the indices are strings for both dataframes,
        # so the column assignments below align on matching labels
        newExpressionData.index = newExpressionData.index.map(str)
        PTData.index = PTData.index.map(str)
        # Acc. to JUMP3:
        # In input argument Time, the first time point of each time series must be 0.
        # Also has to be an integer!
        # NOTE(review): the shift below guarantees a minimum of 0 but does not
        # cast to integer -- confirm whether JUMP3 needs an explicit cast.
        newExpressionData['Time'] = PTData['PseudoTime']-PTData['PseudoTime'].min()
        if 'Experiment' in PTData:
            newExpressionData['Experiment'] = PTData['Experiment']
        else:
            # No experiment labels supplied: treat every row as belonging
            # to a single experiment (number 1).
            newExpressionData['Experiment'] = 1

        newExpressionData.to_csv(jump3Input,
                                 sep = ',', header = True, index = False)
    
    
def run(RunnerObj):
    '''
    Function to run the JUMP3 algorithm inside its docker container.

    The current working directory is mounted into the container at
    ``/JUMP3/data/``; input and output paths handed to ``runJump3`` are
    therefore expressed relative to that mount point (prefixed with
    ``data``). The wrapped ``time -v -o`` call writes its report to
    ``time.txt`` in the output directory.

    :param RunnerObj: object providing ``inputDir``. Its string form must
        contain the current working directory as a prefix and the substring
        ``inputs/``; otherwise the path splitting below raises ``IndexError``.
    '''
    # Path of the generated JUMP3 input as seen from inside the container:
    # whatever follows the current working directory in inputDir.
    inputPath = "data" + str(RunnerObj.inputDir).split(str(Path.cwd()))[1] + \
                    "/JUMP3/ExpressionData.csv"

    # make output dirs if they do not exist:
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/JUMP3/"
    os.makedirs(outDir, exist_ok = True)

    outPath = "data/" +  str(outDir) + 'outFile.txt'
    timePath = "data/" + str(outDir) + 'time.txt'
    cmdToRun = ' '.join(['docker run --rm -v', str(Path.cwd())+':/JUMP3/data/ jump3:base /bin/sh -c \"time -v -o', timePath, './runJump3',
                         inputPath, outPath, '\"'])
    # Echo the exact command so a failed run can be reproduced by hand.
    print(cmdToRun)
    os.system(cmdToRun)



def parseOutput(RunnerObj):
    '''
    Function to parse outputs from JUMP3.

    Reads ``outputs/<dataset>/JUMP3/outFile.txt`` (path relative to the
    current working directory, ``<dataset>`` being whatever follows
    ``inputs/`` in ``RunnerObj.inputDir``), ranks every matrix entry by
    absolute value in descending order and writes the ranking to
    ``rankedEdges.csv`` in the same directory as tab-separated
    ``Gene1``, ``Gene2``, ``EdgeWeight`` rows.

    Gene names are taken from the row index of
    ``<inputDir>/ExpressionData.csv``; matrix row/column positions are
    mapped to names by position in that index.

    :param RunnerObj: object providing ``inputDir`` (must support
        ``joinpath`` and contain ``inputs/`` in its string form).
    :returns: ``None``. If ``outFile.txt`` is missing, a message is printed
        and nothing is written.
    '''
    # Quit if output file does not exist
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/JUMP3/"
    outFilePath = outDir + 'outFile.txt'
    if not Path(outFilePath).exists():
        print(outFilePath + ' does not exist, skipping...')
        return

    # Read output
    # NOTE(review): with pandas' default header handling the first line of
    # outFile.txt is consumed as column names -- confirm that matches the
    # file JUMP3 writes.
    OutDF = pd.read_csv(outFilePath, sep = ',')

    # Sort values in a matrix using code from:
    # https://stackoverflow.com/questions/21922806/sort-values-of-matrix-in-python
    OutMatrix = np.abs(OutDF.values)
    # Flattened argsort, reversed so the largest magnitude comes first.
    idx = np.argsort(OutMatrix, axis = None)[::-1]
    rows, cols = np.unravel_index(idx, OutDF.shape)
    DFSorted = OutMatrix[rows, cols]

    # read input file for list of gene names
    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath('ExpressionData.csv'),
                                     header = 0, index_col = 0)
    GeneList = list(ExpressionData.index)

    # Context manager guarantees the file is closed even if a write fails
    # (e.g. an index outside GeneList).
    with open(outDir + 'rankedEdges.csv', 'w') as outFile:
        outFile.write('Gene1'+'\t'+'Gene2'+'\t'+'EdgeWeight'+'\n')
        outFile.writelines(
            '\t'.join([GeneList[row], GeneList[col], str(val)]) + '\n'
            for row, col, val in zip(rows, cols, DFSorted)
        )