Source code for BLRun.scingeRunner

import os
import subprocess
import pandas as pd
from pathlib import Path
import numpy as np

def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SCINGE.
    If the folder/files under RunnerObj.datadir exist,
    this function will not do anything.
    '''
    if not RunnerObj.inputDir.joinpath("SCINGE").exists():
        print("Input folder for SCINGE does not exist, creating input folder...")
        RunnerObj.inputDir.joinpath("SCINGE").mkdir(exist_ok = False)

    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    colNames = PTData.columns
    for idx in range(len(colNames)):
        # Select cells belonging to each pseudotime trajectory
        colName = colNames[idx]
        index = PTData[colName].index[PTData[colName].notnull()]

        # Write one expression matrix per trajectory (cells x genes,
        # with an added PseudoTime column)
        exprName = "SCINGE/ExpressionData"+str(idx)+".csv"
        newExpressionData = ExpressionData.loc[:,index].T
        newExpressionData['PseudoTime'] = PTData.loc[index,colName]
        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                                 sep = ',', header = True, index = False)
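# Usage sketch (illustrative, not part of the module): generateInputs() only needs
# an object exposing inputDir, exprData, and cellData. The paths and file names
# below are assumptions for demonstration.
#
#   from pathlib import Path
#   from types import SimpleNamespace
#   runner = SimpleNamespace(inputDir=Path.cwd().joinpath("inputs/example"),
#                            exprData="ExpressionData.csv",
#                            cellData="PseudoTime.csv")
#   generateInputs(runner)
#   # writes inputs/example/SCINGE/ExpressionData<idx>.csv, one file per
#   # PseudoTime column, each cells x genes plus a trailing PseudoTime column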
def run(RunnerObj):
    '''
    Function to run SCINGE algorithm
    '''
    inputPath = "data" + str(RunnerObj.inputDir).split(str(Path.cwd()))[1] + \
                "/SCINGE/"

    # make output dirs if they do not exist:
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/SCINGE/"
    os.makedirs(outDir, exist_ok = True)

    # if the parameters aren't specified, then use default parameters
    # TODO allow passing in multiple sets of hyperparameters
    # these must be in the right order!
    params_order = [
        'lambda', 'dT', 'num_lags', 'kernel_width', 'prob_zero_removal',
        'prob_remove_samples', 'family', 'num_replicates',
    ]
    default_params = {
        'lambda': '0.01',
        'dT': '10',
        'num_lags': '5',
        'kernel_width': '4',
        'prob_zero_removal': '0',
        'prob_remove_samples': '0.2',
        'family': 'gaussian',
        'num_replicates': '2',
    }
    params = RunnerObj.params
    for param, val in default_params.items():
        if param not in params:
            params[param] = val
    params_str = ' '.join(str(params[p]) for p in params_order)

    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)
    colNames = PTData.columns
    for idx in range(len(colNames)):
        # Run SCINGE in Docker once per pseudotime trajectory
        outPath = str(outDir) + str(idx) + "/"
        os.makedirs(outPath, exist_ok = True)
        outFile = "data/" + outPath
        inputFile = inputPath + "ExpressionData"+str(idx)+".csv"

        cmdToRun = ' '.join(['docker run --rm -v',
                             str(Path.cwd())+':/runSCINGE/data/ scinge:base /bin/sh -c \"time -v -o',
                             "data/" + str(outDir) + 'time'+str(idx)+'.txt',
                             './runSCINGE ', inputFile, outFile, params_str, '\"'])
        print(cmdToRun)
        # also print the parameters
        print("\tParameters: %s" %
              (', '.join("%s: %s" % (p, str(params[p])) for p in params_order)))
        subprocess.check_call(cmdToRun, shell=True)
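# For illustration only: with the default parameters above and an input directory
# <cwd>/inputs/example, the command assembled for trajectory 0 has roughly this
# shape (the paths are hypothetical):
#
#   docker run --rm -v <cwd>:/runSCINGE/data/ scinge:base /bin/sh -c \
#       "time -v -o data/outputs/example/SCINGE/time0.txt ./runSCINGE \
#        data/inputs/example/SCINGE/ExpressionData0.csv \
#        data/outputs/example/SCINGE/0/ 0.01 10 5 4 0 0.2 gaussian 2"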
def parseOutput(RunnerObj):
    '''
    Function to parse outputs from SCINGE.
    '''
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/SCINGE/"

    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)
    colNames = PTData.columns
    OutSubDF = [0]*len(colNames)
    for idx in range(len(colNames)):
        # Quit if the output file does not exist
        if not Path(outDir+ str(idx)+'/SCINGE_Ranked_Edge_List.txt').exists():
            print(outDir+ str(idx)+'/SCINGE_Ranked_Edge_List.txt does not exist, skipping...')
            return
        # Read output
        OutSubDF[idx] = pd.read_csv(outDir+ str(idx)+'/SCINGE_Ranked_Edge_List.txt',
                                    sep = '\t', header = 0)

    # Merge the per-trajectory dataframes by taking the maximum value for each edge
    # Code from here:
    # https://stackoverflow.com/questions/20383647/pandas-selecting-by-label-sometimes-return-series-sometimes-returns-dataframe
    outDF = pd.concat(OutSubDF)
    outDF.columns = ['Gene1','Gene2','EdgeWeight']

    # Group-by-rows code is from here:
    # https://stackoverflow.com/questions/53114609/pandas-how-to-remove-duplicate-rows-but-keep-all-rows-with-max-value
    res = outDF[outDF['EdgeWeight'] == outDF.groupby(['Gene1','Gene2'])['EdgeWeight'].transform('max')]

    # Sort values in the dataframe
    finalDF = res.sort_values('EdgeWeight', ascending=False)
    finalDF.to_csv(outDir+'rankedEdges.csv', sep='\t', index = False)
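# Illustrative only: the rankedEdges.csv written above is tab-separated with
# columns Gene1, Gene2, EdgeWeight, keeping for each regulator/target pair the
# maximum SCINGE score across trajectories. The gene names and weights below
# are made up.
#
#   Gene1   Gene2   EdgeWeight
#   g12     g5      0.873
#   g3      g17     0.641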