# Source code for BLRun.scnsRunner
import os
from pathlib import Path
import pandas as pd
from itertools import permutations
from collections import Counter
import re
def _countRegulators(refNetworkPath, genes):
    '''
    Count activating ('+') and repressing ('-') incoming edges per gene.

    Each line of the reference network is split on ',' and read as
    regulator, target, sign. Lines with fewer than three fields, lines
    whose sign is neither '+' nor '-' (this includes the header row) and
    lines whose target is not in ``genes`` are ignored.

    :returns: tuple ``(countA, countR)`` of dicts keyed by gene name.
    '''
    countA = dict.fromkeys(genes, 0)
    countR = dict.fromkeys(genes, 0)
    with open(refNetworkPath, 'r') as refNetwork:
        for line in refNetwork:
            items = line.strip().split(',')
            if len(items) < 3 or items[1] not in countA:
                continue
            if items[2] == '+':
                countA[items[1]] += 1
            elif items[2] == '-':
                countR[items[1]] += 1
    return countA, countR


def _writeEdges(edgesPath, BinExpression):
    '''
    Write one ``Gene,StateA,StateB`` row for every ordered pair of cells
    whose boolean expression differs in exactly one gene.
    '''
    states = BinExpression.to_numpy()
    cells = BinExpression.index
    genes = BinExpression.columns
    with open(edgesPath, 'w') as edges:
        edges.write('Gene,StateA,StateB\n')
        for i, cellA in enumerate(cells):
            # Compare cell i against every cell at once instead of
            # walking the frame row by row a second time.
            differs = states != states[i]
            neighbours = (differs.sum(axis=1) == 1).nonzero()[0]
            for j in neighbours:
                # Exactly one entry is True, so argmax is its position.
                gene = genes[differs[j].argmax()]
                edges.write(f"{gene},{cellA},{cells[j]}\n")


def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SCNS.

    Creates the ``SCNS`` folder under ``RunnerObj.inputDir`` when it is
    missing and then writes the following files into it on every call:

    - ``ExpressionData.csv``: unique boolean cells x genes matrix
    - ``initial.txt`` / ``target.txt``: names of the initial / target cells
    - ``Parameters.csv``: per-gene activator / repressor limits
    - ``Edges.csv``: pairs of cells that differ in a single gene

    :param RunnerObj: object providing ``inputDir`` (a ``pathlib.Path``),
        ``exprData`` and ``cellData`` (file names relative to ``inputDir``).
        A ``refNetwork.csv`` file is also read from ``inputDir``.
    '''
    scnsDir = RunnerObj.inputDir.joinpath("SCNS")
    if not scnsDir.exists():
        print("Input folder for SCNS does not exist, creating input folder...")
        scnsDir.mkdir(exist_ok=True)

    # input file
    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header=0, index_col=0)

    # Convert input expression to boolean:
    # a gene is "True" in a cell when its value is >= the gene's
    # average expression across cells, else "False".
    BinExpression = (ExpressionData.T
                     >= ExpressionData.mean(axis='columns')).drop_duplicates()

    # Write unique cells x genes output to a file
    BinExpression.to_csv(scnsDir.joinpath("ExpressionData.csv"))

    # Read PseudoTime file to figure out initial and final states
    PseudoTimeDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                               header=0, index_col=0)

    # Time of each retained cell (largest value across the time columns).
    # Initial cells are at or below the 10th percentile of that time,
    # target cells are at or above the 90th percentile.
    StateDF = PseudoTimeDF.loc[BinExpression.index].max(axis='columns')
    with open(scnsDir.joinpath("initial.txt"), 'w') as initialStates:
        for ix in StateDF[StateDF <= StateDF.quantile(0.1)].index:
            initialStates.write(f"{ix}\n")
    with open(scnsDir.joinpath("target.txt"), 'w') as targetStates:
        for ix in StateDF[StateDF >= StateDF.quantile(0.9)].index:
            targetStates.write(f"{ix}\n")

    # Per-gene limits: at most 2 activators / repressors, bounded by the
    # number of such edges in the reference network.
    countA, countR = _countRegulators(RunnerObj.inputDir.joinpath("refNetwork.csv"),
                                      BinExpression.columns)
    with open(scnsDir.joinpath("Parameters.csv"), 'w') as parameters:
        parameters.write('Gene,MaxActivators,MaxRepressors,Threshold\n')
        for gene in BinExpression.columns:
            parameters.write(f"{gene}, {min(2, countA[gene])}, "
                             f"{min(2, countR[gene])}, 95\n")

    # generate Edges file: two cells are connected by an edge
    # if they only differ in the boolean expression of one gene
    _writeEdges(scnsDir.joinpath("Edges.csv"), BinExpression)
def run(RunnerObj):
    '''
    Function to run SCNS algorithm.

    Builds a ``docker run`` command for the ``scns:base`` image, prints it
    and executes it through the shell. The current working directory is
    mounted into the container at ``/SCNS-Toolkit/SynthesisEngine/data/``,
    so every path handed to the engine is prefixed with ``data``.

    :param RunnerObj: object whose ``inputDir`` lies below the current
        working directory and contains an ``inputs/`` path component.
    '''
    workDir = str(Path.cwd())
    datasetDir = str(RunnerObj.inputDir)

    # Location of the generated SCNS inputs as seen inside the container
    containerIn = "data" + datasetDir.split(workDir)[1] + "/SCNS/"

    # make output dirs if they do not exist:
    hostOut = "outputs/" + datasetDir.split("inputs/")[1] + "/SCNS/"
    os.makedirs(hostOut, exist_ok=True)
    containerOut = "data/" + str(hostOut)

    engineInputs = ('ExpressionData.csv', 'Edges.csv', 'Parameters.csv',
                    'initial.txt', 'target.txt')

    pieces = ['docker run --rm -v',
              workDir + ':/SCNS-Toolkit/SynthesisEngine/data/',
              'scns:base /bin/sh -c \"time -v -o',
              containerOut + 'time.txt',
              'mono SynthesisEngine.exe']
    pieces.extend(containerIn + fileName for fileName in engineInputs)
    pieces.append(containerOut)
    pieces.append('\"')

    cmdToRun = ' '.join(pieces)
    print(cmdToRun)
    os.system(cmdToRun)
def parseOutput(RunnerObj):
    '''
    Function to parse output from SCNS.

    For every gene in ``SCNS/ExpressionData.csv`` the file
    ``<gene>.txt`` in the SCNS output folder is tokenised, and the number
    of times each other gene name appears in it becomes the weight of the
    edge other-gene -> gene. Self-edges are ignored. The result is
    written, sorted by decreasing weight, to ``rankedEdges.csv``
    (tab-separated, columns ``Gene1``, ``Gene2``, ``EdgeWeight``).

    Genes whose output file is missing are reported and skipped; their
    incoming edges stay empty in the written table.

    :param RunnerObj: object whose ``inputDir`` (a ``pathlib.Path``)
        contains an ``inputs/`` path component.
    '''
    # Get list of input genes
    ExprDF = pd.read_csv(RunnerObj.inputDir.joinpath("SCNS/ExpressionData.csv"),
                         index_col=0, header=0)
    outDir = "outputs/" + str(RunnerObj.inputDir).split("inputs/")[1] + "/SCNS/"
    geneList = list(ExprDF.columns)

    # Raw string: '\(' and '\)' are regex escapes, not string escapes.
    tokenSep = re.compile(r'\t| |\(|\)|\n|,')

    # One row per ordered (target, regulator) pair, in permutation order;
    # rows stay empty until the target's output file has been read.
    missing = float('nan')
    rows = {pair: (missing, missing, missing)
            for pair in permutations(geneList, r=2)}

    for gene in geneList:
        outFile = outDir + gene + '.txt'
        if not Path(outFile).exists():
            # Skip if output file does not exist
            print(outFile + ' does not exist, skipping...')
            continue
        with open(outFile) as rules:
            count = Counter(tokenSep.split(rules.read()))
        for gene2 in geneList:
            if gene != gene2:
                # ignoring self-edges; a Counter yields 0 for absent genes
                rows[(gene, gene2)] = (gene2, gene, count[gene2])

    # dtype=object keeps the weights as plain ints next to the empty rows,
    # so they are written as "3" rather than "3.0".
    rankedEdges = pd.DataFrame(list(rows.values()),
                               columns=['Gene1', 'Gene2', 'EdgeWeight'],
                               dtype=object)
    rankedEdges.sort_values('EdgeWeight', ascending=False, inplace=True)
    rankedEdges.to_csv(outDir + 'rankedEdges.csv', sep='\t', index=False)