Source code for BLRun.scnsRunner

import os
from pathlib import Path
import pandas as pd
from itertools import permutations
from collections import Counter
import re

[docs]def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SCNS.
    If the folder/files under RunnerObj.datadir exist, 
    this function will not do anything.
    '''
    
    if not RunnerObj.inputDir.joinpath("SCNS").exists():
        print("Input folder for SCNS does not exist, creating input folder...")
        RunnerObj.inputDir.joinpath("SCNS").mkdir(exist_ok = False)
        
    # input file
    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header = 0, index_col = 0)

    # Convert input expression to boolean
    # If  the gene's expression value is >= it's avg. expression across cells
    # it receieves a "True", else "False"
    BinExpression = ExpressionData.T >= ExpressionData.mean(axis = 'columns')
    BinExpression.drop_duplicates(inplace= True)
    # Write unique cells x genes output to a file
    BinExpression.to_csv(RunnerObj.inputDir.joinpath("SCNS/ExpressionData.csv")) 

    # Read PseudoTime file to figure out
    # initial and final states
    PseudoTimeDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                                          header = 0, index_col = 0)
    # Get the Time corresponding to cells in BinExpression dataframe
    # Identify cells in initial and final states from them
    # cells in initial states are the ones with the earliest time pts (<10th percentile)
    # cells in target states are the ones with the latest time pts (>90th percentile)

    StateDF = PseudoTimeDF.loc[BinExpression.index].max(axis='columns')
    initialStates =  open(RunnerObj.inputDir.joinpath("SCNS/initial.txt"),'w')                           
    for ix in StateDF[StateDF <= StateDF.quantile(0.1)].index:
           initialStates.write(ix +'\n')
    initialStates.close()

    targetStates =  open(RunnerObj.inputDir.joinpath("SCNS/target.txt"),'w')                           
    for ix in StateDF[StateDF >= StateDF.quantile(0.9)].index:
           targetStates.write(ix +'\n')
    targetStates.close()


    parameters =  open(RunnerObj.inputDir.joinpath("SCNS/Parameters.csv"),'w')
    refNetwork =  open(RunnerObj.inputDir.joinpath("refNetwork.csv"),'r')
    countA = {gene:0 for gene in BinExpression.columns}
    countR = {gene:0 for gene in BinExpression.columns}
    for line in refNetwork:
        items = line.strip().split(',')
        if items[2] == '+':
            countA[items[1]] += 1
        elif items[2] == '-':
            countR[items[1]] += 1
        else:
            continue
            
    refNetwork.close()

    parameters.write('Gene,MaxActivators,MaxRepressors,Threshold\n')
    for cols in BinExpression.columns:
           parameters.write(cols+', '+str(min(2,countA[cols]))+', '+str(min(2,countA[cols]))+ ', 95\n')
    parameters.close()

    # generate Edges file
    # two cells are connected by an edge
    # if they only differ in the boolean 
    # expression of one gene
    States =  open(RunnerObj.inputDir.joinpath("SCNS/Edges.csv"),'w') 
    States.write('Gene,StateA,StateB\n')
    colNames =  BinExpression.columns
    for idx_i, row_i in BinExpression.iterrows():
        for idx_j, row_j in BinExpression.iterrows():
            if list(row_i == row_j).count(False) == 1:
                States.write(str(colNames[list(row_i == row_j).index(False)])+','
                             +str(idx_i)+','+str(idx_j)+'\n')
    States.close()

[docs]def run(RunnerObj):
    '''
    Function to run SCNS algorithm
    '''

    inputPath = "data" + str(RunnerObj.inputDir).split(str(Path.cwd()))[1] + "/SCNS/"
                    
    
    # make output dirs if they do not exist:
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/SCNS/"
    os.makedirs(outDir, exist_ok = True)
    
    outPath = "data/" +  str(outDir)
    cmdToRun = ' '.join(['docker run --rm -v', str(Path.cwd())+':/SCNS-Toolkit/SynthesisEngine/data/', 
                         'scns:base /bin/sh -c \"time -v -o', "data/" + str(outDir) + 'time.txt',
                         'mono SynthesisEngine.exe', inputPath+'ExpressionData.csv',
                          inputPath+'Edges.csv',  inputPath+'Parameters.csv',
                          inputPath+'initial.txt',  inputPath+'target.txt',
                          outPath, '\"'])

    print(cmdToRun)
    os.system(cmdToRun)
                                   
[docs]def parseOutput(RunnerObj):
    '''
    Function to parse output from SCNS
    '''
    # Get list of input genes
    ExprDF = pd.read_csv(RunnerObj.inputDir.joinpath("SCNS/ExpressionData.csv"), index_col = 0, header = 0)
    outDir = "outputs/"+str(RunnerObj.inputDir).split("inputs/")[1]+"/SCNS/"
    geneList = list(ExprDF.columns)
    
    # Initialize ranked egdes file 
    possibleEdges = list(permutations(geneList, r = 2))
    trueEdges =  {'|'.join(p):0 for p in possibleEdges}
    rankedEdges = pd.DataFrame(index = trueEdges.keys(),columns=['Gene1','Gene2','EdgeWeight'])
        
    for gene in geneList:
        outFile = gene+'.txt'
        if Path(outDir+outFile).exists():
            count = {}
            for w in re.split('\t| |\(|\)|\n|,', open(outDir+outFile).read()):
                if w in count:
                    count[w] += 1
                else:
                    count[w] = 1
            for gene2 in geneList:
                if gene2 in count.keys():
                    trueEdges[gene+'|'+gene2] = count[gene2]
                if gene != gene2:
                    # ignoring self-edges
                    rankedEdges.loc[gene+'|'+gene2,'EdgeWeight'] = trueEdges[gene+'|'+gene2]
                    rankedEdges.loc[gene+'|'+gene2,'Gene1'] = gene2
                    rankedEdges.loc[gene+'|'+gene2,'Gene2'] = gene
        else:
            # Skip if output file does not exist
            print(outDir+outFile+' does not exist, skipping...')
        
    rankedEdges.sort_values('EdgeWeight', ascending=False, inplace=True)
    rankedEdges.to_csv(outDir+'rankedEdges.csv',sep='\t', index = False)