# Source code for BLRun.sinceritiesRunner

import os
import shlex
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd


def generateInputs(RunnerObj):
    '''
    Generate the per-trajectory input files for SINCERITIES.

    The ``SINCERITIES`` folder under ``RunnerObj.inputDir`` is created when
    it is missing. For every column of the pseudotime file, one
    ``SINCERITIES/ExpressionData<idx>.csv`` is written (``idx`` is the
    zero-based column position). Each file holds one row per cell that has a
    non-null pseudotime in that column, one column per gene, and a final
    ``Time`` column with the midpoint of the pseudotime quantile bin the cell
    falls into. Existing files are overwritten on every call.

    :param RunnerObj: An instance of the :class:`BLRun`. This function reads
        its ``inputDir`` (a path object), ``exprData`` and ``cellData`` (file
        names relative to ``inputDir``) and ``params['nBins']``.
    '''
    sinceritiesDir = RunnerObj.inputDir.joinpath("SINCERITIES")
    if not sinceritiesDir.exists():
        print("Input folder for SINCERITIES does not exist, creating input folder...")
        # exist_ok=True so that a folder created between the check above and
        # this call does not abort the run.
        sinceritiesDir.mkdir(exist_ok = True)

    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    # The number of bins is the same for every trajectory, so parse it once.
    nBins = int(RunnerObj.params['nBins'])

    for idx, colName in enumerate(PTData.columns):
        # Select cells belonging to each pseudotime trajectory
        cellTimes = PTData[colName].dropna()

        # Rows become cells, columns become genes.
        newExpressionData = ExpressionData.loc[:, cellTimes.index].T

        # Perform quantile binning as recommended in the paper
        # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut
        tQuantiles = pd.qcut(cellTimes, q = nBins, duplicates = 'drop')
        # Every cell gets the midpoint of its bin as its time point.
        newExpressionData['Time'] = [(a.left + a.right)/2 for a in tQuantiles]

        exprName = "SINCERITIES/ExpressionData" + str(idx) + ".csv"
        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                                 sep = ',', header = True, index = False)
def run(RunnerObj):
    '''
    Run the SINCERITIES algorithm in its docker container, once per
    pseudotime trajectory.

    The current working directory is mounted at ``/SINCERITIES/data/`` inside
    the ``sincerities:base`` image, so ``RunnerObj.inputDir`` must be located
    under the current working directory and must contain an ``inputs/`` path
    component. For trajectory ``idx`` the container reads
    ``SINCERITIES/ExpressionData<idx>.csv`` from the input folder and writes
    ``outFile<idx>.txt`` and ``time<idx>.txt`` under
    ``outputs/<dataset>/SINCERITIES/``.

    A container that cannot be started or that exits with a non-zero status
    is reported on stdout; the remaining trajectories are still attempted.

    :param RunnerObj: A BLRun object; ``inputDir`` and ``cellData`` are read.
    :raises IndexError: if ``inputDir`` is not below the current working
        directory or its path does not contain ``inputs/``.
    '''
    workDir = str(Path.cwd())
    inputPath = "data" + str(RunnerObj.inputDir).split(workDir)[1] + \
                "/SINCERITIES/"

    # make output dirs if they do not exist:
    outDir = "outputs/" + str(RunnerObj.inputDir).split("inputs/")[1] + "/SINCERITIES/"
    os.makedirs(outDir, exist_ok = True)

    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    for idx in range(len(PTData.columns)):
        inFile = inputPath + "ExpressionData" + str(idx) + ".csv"
        outPath = "data/" + outDir + "outFile" + str(idx) + ".txt"
        timePath = "data/" + outDir + "time" + str(idx) + ".txt"

        # This string is interpreted by /bin/sh inside the container, so the
        # paths are quoted to survive spaces or shell metacharacters.
        innerCmd = ' '.join(['time -v -o', shlex.quote(timePath),
                             'Rscript MAIN.R',
                             shlex.quote(inFile), shlex.quote(outPath)])

        # Passed as an argument list (no host shell), so the mounted working
        # directory needs no quoting.
        cmdToRun = ['docker', 'run', '--rm',
                    '-v', workDir + ':/SINCERITIES/data/',
                    'sincerities:base',
                    '/bin/sh', '-c', innerCmd]
        print(' '.join(shlex.quote(arg) for arg in cmdToRun))

        try:
            status = subprocess.run(cmdToRun).returncode
        except OSError as err:
            # e.g. the docker executable is not installed; keep going so the
            # behaviour stays best-effort.
            print('Could not start SINCERITIES for trajectory ' + str(idx) + ': ' + str(err))
            continue
        if status != 0:
            print('SINCERITIES exited with status ' + str(status) +
                  ' for trajectory ' + str(idx))
def parseOutput(RunnerObj):
    '''
    Parse the outputs from SINCERITIES into a single ranked edge list.

    One ``outFile<idx>.txt`` is read per column of the pseudotime file from
    ``outputs/<dataset>/SINCERITIES/`` (relative to the current working
    directory). For every (``SourceGENES``, ``TargetGENES``) pair the largest
    ``Interaction`` over all trajectories is kept, each pair is reported once,
    and the result is written, sorted by decreasing weight, to
    ``rankedEdges.csv`` (tab separated, columns ``Gene1``, ``Gene2``,
    ``EdgeWeight``) in the same folder.

    If any expected output file is missing, a message is printed and nothing
    is written.

    :param RunnerObj: An instance of the :class:`BLRun`; ``inputDir`` and
        ``cellData`` are read.
    :raises IndexError: if the path of ``inputDir`` does not contain
        ``inputs/``.
    '''
    outDir = "outputs/" + str(RunnerObj.inputDir).split("inputs/")[1] + "/SINCERITIES/"

    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    OutSubDF = []
    for idx in range(len(PTData.columns)):
        # Read output
        outFile = outDir + 'outFile' + str(idx) + '.txt'
        if not Path(outFile).exists():
            # Quit if output file does not exist
            print(outFile + ' does not exist, skipping...')
            return
        OutSubDF.append(pd.read_csv(outFile, sep = ',', header = 0))

    if not OutSubDF:
        # pd.concat raises on an empty list; there is nothing to rank.
        print('No pseudotime trajectories found for ' + outDir + ', skipping...')
        return

    # Merge the dataframes by taking the maximum value from each DF
    outDF = pd.concat(OutSubDF, ignore_index = True)

    # Group by rows code is from here:
    # https://stackoverflow.com/questions/53114609/pandas-how-to-remove-duplicate-rows-but-keep-all-rows-with-max-value
    edgeCols = ['SourceGENES', 'TargetGENES']
    isMax = outDF['Interaction'] == outDF.groupby(edgeCols)['Interaction'].transform('max')
    # A maximum that is tied across trajectories matches several rows;
    # keep a single row per edge.
    res = outDF[isMax].drop_duplicates(subset = edgeCols)

    # Sort values in the dataframe
    finalDF = res.sort_values('Interaction', ascending = False)

    # SINCERITIES output is incorrectly ordered: its "source" column holds
    # the second gene of the edge and its "target" column the first.
    finalDF = finalDF[edgeCols + ['Interaction']].rename(
        columns = {'SourceGENES': 'Gene2',
                   'TargetGENES': 'Gene1',
                   'Interaction': 'EdgeWeight'})
    finalDF.to_csv(outDir + 'rankedEdges.csv', sep = '\t',
                   columns = ['Gene1', 'Gene2', 'EdgeWeight'], index = False)