Source code for src.BoolODE

#!/usr/bin/env python
# coding: utf-8
__author__ = 'Amogh Jalihal'
import sys
import numpy as np
import scipy as sc
import pandas as pd
from itertools import combinations
from scipy.integrate import odeint
from sklearn.cluster import KMeans
import time
import importlib
import warnings
from tqdm import tqdm 
from optparse import OptionParser
from multiprocessing import Process, Lock, Manager
import ast
from importlib.machinery import SourceFileLoader
import utils 
import simulator 

np.seterr(all='raise')
import os

[docs]def readBooleanRules(path, parameterInputsPath, outPrefix='',
                     add_dummy=False, max_parents=1):
    """
    Reads a rule file from path and stores the rules in a dictionary
    
    Parameters
    ----------
    :param path: Path to Boolean Rule file
    :type path: str
    :param parameterInputsPath: Path to file containing input parameters to the model
    :type parameterInputsPath: str
    :param add_dummy: Experimental feature, adds dummy targets to each gene in the model. Useful to change the density of the network
    :type add_dummy: bool
    :param max_parents: Experimental feature, specifies number of parent genes per dummy gene added
    :type max_parents: int

    :returns:
        - DF: Dataframe containing rules
        - withoutrules: list of nodes in input file without rules
    """
    DF = pd.read_csv(path,sep='\t',engine='python')
    withRules = list(DF['Gene'].values)
    allnodes = set()
    for ind,row in DF.iterrows():
        rhs = row['Rule']
        rhs = rhs.replace('(',' ')
        rhs = rhs.replace(')',' ')
        tokens = rhs.split(' ')

        reg = [t for t in tokens if t not in ['not','and','or','']]
        allnodes.update(set(reg))

    withoutRules = allnodes.difference(set(withRules))
    for n in withoutRules:
        if len(parameterInputsPath) == 0:
            print(n, "has no rule, adding self-activation.")
            DF = DF.append({'Gene':n,'Rule':n},ignore_index=True)
        else:
            print("Treating %s as parameter" % {n})
    ## Add dummy genes
    ## determinisitcally add parents
    if add_dummy:
        print('using max_parents=' + str(max_parents))
        num_dummy = 2*len(allnodes)
        for dg in range(num_dummy):
            ## Variable number of parents
            # DF = DF.append({'Gene':'dummy' + str(dg),
            #                 'Rule':' or '.join([s for s in \
            #                                     np.random.choice(list(allnodes),
            #                                                      size=np.random.choice([i+1 for i in range(len(allnodes)-1)]))])},
            #                ignore_index = True)
            DF = DF.append({'Gene':'dummy' + str(dg),
                            'Rule':' or '.join([s for s in \
                                                np.random.choice(list(allnodes),
                                                                 size=max_parents)])},
                           ignore_index = True)
            DF.to_csv(outPrefix + 'rules-with-added-genes.csv')
            
    print(DF)
    return DF, withoutRules

[docs]def getParameters(DF,identicalPars,
                  samplePars,
                  sampleStd,
                  genelist,
                  proteinlist,
                  withoutRules,
                  parameterInputsDF,
                  parameterSetDF,
                  interactionStrengthDF):
    """
    Create dictionary of parameters and values. Assigns
    parameter values by evaluating the Boolean expression
    for each variable.

    :param DF: Table of values with two columns, 'Gene' specifies target, 'Rule' specifies Boolean function
    :type DF: pandas DataFrame
    :param identicalPars: Passed to utils.getSaneNval to set identical parameters
    :type identicalPars: bool
    :param samplePars: Sample kinetic parameters using a Gaussian distribution  centered around the default parameters
    :type samplePars: bool
    :param parameterInputsDF: Optional table that specifies input parameter values. Useful to specify experimental conditions.
    :type parameterInputsDF: pandas DataFrame
    :param parameterSetDF: Optional table that specifies predefined parameter set.
    :type parameterSetDF: pandas DataFrame
    :param interactionStrengthDF: Optional table that specifies interaction strengths. When not specified, default strength is set to 1.
    :type interactionStrengthDF: pandas DataFrame
    :returns:
        pars : dict
            Dictionary of parameters 
    """
    ## Set default parameters
    mRNATranscription = 20.
    mRNADegradation = mRNATranscription/2.
    proteinTranslation = 10
    proteinDegradation = 1
    ## The threshold is calculated as the max value of the species
    ## divided by 2.
    y_max = (proteinTranslation/proteinDegradation)*\
            (mRNATranscription/mRNADegradation)
    x_max = (mRNATranscription/mRNADegradation)
    hillThreshold = y_max/2

    # Chosen arbitrarily
    signalingtimescale = 5.0
    hillCoefficient = 10
    interactionStrength = 1.0

    parameterNamePrefixAndDefaultsAll = {
        # Hill coefficients
        'n_':hillCoefficient,
        # Thresholds
        'k_':hillThreshold,
    }     
    parameterNamePrefixAndDefaultsGenes = {
        # mRNA transcription rates
        'm_':mRNATranscription,
        # mRNA degradation rates
        'l_x_':mRNADegradation,
        # Protein translation rate
        'r_':proteinTranslation,
        # protein degradation rates
        'l_p_':proteinDegradation
    }     
    
    par = dict()
    ## If there is even one signaling protein,
    ## create the y_max parameter
    if len(proteinlist) > 0:
        par['y_max'] = y_max
        par['signalingtimescale'] = signalingtimescale
        
    parameterInputsDict  = {}
    interactionStrengths = {}

    ## Get set of species which have rules specified for them
    species = set(DF['Gene'].values)
    
    ## Check 1:
    ## If a parameter input file is specified,
    ## Every row in parameterInputsDF contains two
    ## columns:
    ##     'Inputs' is a python list of strings
    ## specifying the parameters.
    ##     'Values' is a python list of floats
    ## each corresponding to the value of a parameter.
    ## Now, we create a fake hill term for this input, even
    ## though there is no equation corresponding to it. Thus,
    ## we create a hillThreshold and hillCoefficient 
    ## term for it. This is useful so we can treat this input
    ## parameter as a "regulator" and leave the structure of
    ## the logic term unchanged.
    if parameterInputsDF is not None:
        for i, row in parameterInputsDF.iterrows():
            # initialize
            parameterInputsDict[i] = {p:0 for p in withoutRules}
            inputParams = ast.literal_eval(row['Inputs'])
            inputValues = ast.literal_eval(row['Values'])
            # Set to a max value
            parameterInputsDict[i].update({p:hillThreshold*2 for p,v in\
                                       zip(inputParams,inputValues) if v > 0})
            
        # Add input parameters, set to 0 by default
        par.update({p:0 for p in parameterInputsDict[0].keys()})
        par.update({'k_'+p:hillThreshold for p in parameterInputsDict[0].keys()})
        par.update({'n_'+p:hillCoefficient for p in parameterInputsDict[0].keys()})


    ## Check 2:
    ## If interaction strengths are specified.
    ## Create a new parameter specifying the strength
    ## of a given interaction. For all other equations in
    ## which "regulator" appears in, its hillThreshold value
    ## will remain the default value.
    if interactionStrengthDF is not None:
        for i,row in interactionStrengthDF.iterrows():
            regulator = row['Gene2']
            target = row['Gene1']
            interactionStrength = row['Strength']
            par.update({'k_' + regulator + '_' + target: hillThreshold/interactionStrength})

    ## Check 3:
    ## Whether to sample parameters or stick to defaults
    if samplePars:
        print("Sampling parameters")
        sigmamult = sampleStd
        print("Using std=" + str(sampleStd))
        lomult = 0.9
        himult = 1.1
        for parPrefix, parDefault in parameterNamePrefixAndDefaultsAll.items():
            sampledParameterValues = utils.getSaneNval(len(species),\
                                                 lo=lomult*parDefault,\
                                                 hi=himult*parDefault,\
                                                 mu=parDefault,\
                                                 sig=sigmamult*parDefault,\
                                                 identicalPars=identicalPars)
            for sp, sparval in zip(species, sampledParameterValues):
                if sp in genelist:
                    par[parPrefix + sp] = sparval
                    
        transcriptionRate = 0.0
        mRNADegradationRate = 0.0
        for parPrefix, parDefault in parameterNamePrefixAndDefaultsGenes.items():
            sampledParameterValues = utils.getSaneNval(len(species),\
                                                 lo=lomult*parDefault,\
                                                 hi=himult*parDefault,\
                                                 mu=parDefault,\
                                                 sig=sigmamult*parDefault,\
                                                 identicalPars=identicalPars)
            if identicalPars:
                if parPrefix == 'm_':
                    transcriptionRate = sampledParameterValues[0]
                if parPrefix == 'l_x_':
                    mRNADegradationRate = sampledParameterValues[0]
            else:
                if parPrefix == 'm_':
                    transcriptionRate = parDefault
                if parPrefix == 'l_x_':
                    mRNADegradationRate = parDefault
            
            for sp, sparval in zip(species, sampledParameterValues):
                if sp in genelist:
                    par[parPrefix + sp] = sparval
        ## Based on these samples, estimate max value a
        ## gene can take at _steady-state_
        x_max = (transcriptionRate/mRNADegradationRate)
    else:
        print("Fixing rate parameters to defaults")
        for parPrefix, parDefault in parameterNamePrefixAndDefaultsAll.items():
            for sp in species:
                par[parPrefix + sp] = parDefault

        for parPrefix, parDefault in parameterNamePrefixAndDefaultsGenes.items():
            for sp in species:
                if sp in genelist:
                    par[parPrefix + sp] = parDefault
    ## Final Check.
    ## If parameterSetDF is specified, reassign
    ## parameter values to those from table
    ## This guarantees that all the parameters are defined without
    ## specific checks in parameterSetDF
    if parameterSetDF is not None:
        for pname, pvalue in parameterSetDF.iterrows():
            par[pname] = pvalue[1]
    return par, x_max, parameterInputsDict

[docs]def generateModelDict(DF,identicalPars,
                      samplePars,
                      sampleStd,
                      withoutRules,
                      speciesTypeDF,
                      parameterInputsDF,
                      parameterSetDF,
                      interactionStrengthDF):
    """
    Take a DataFrame object with Boolean rules,
    construct ODE equations for each variable.
    Takes optional parameter inputs and interaction
    strengths

    :param DF: Table of values with two columns, 'Gene' specifies target, 'Rule' specifies Boolean function
    :type DF: pandas DataFrame
    :param identicalPars: Passed to utils.getSaneNval to set identical parameters
    :type identicalPars: bool
    :param samplePars: Sample kinetic parameters using a Gaussian distribution  centered around the default parameters
    :type samplePars: bool
    :param sampleStd: Sample from a distribution of mean=par, sigma=sampleStd*par
    :type sampleStd: float
    :param speciesTypeDF: Table defining type of each species. Takes values  'gene' or 'protein'
    :type speciesTypeDF: pandas DataFrame
    :param parameterInputsDF: Optional table that specifies parameter input values. Useful to specify experimental conditions.
    :type parameterInputsDF: pandas DataFrame
    :param parameterSetDF: Optional table that specifies a predefined parameter set.
    :type parameterSetDF: pandas DataFrame
    :param interactionStrengthDF: Optional table that specifies interaction strengths. When not specified, default strength is set to 1.
    :type interactionStrengthDF: pandas DataFrame
    :returns:
        ModelSpec : dict
            Dictionary of dictionaries. 
            'varspecs' {variable:ODE equation}
            'ics' {variable:initial condition}
            'pars' {parameter:parameter value}
    """
    species = set(DF['Gene'].values)

    # Variables:
    ## Check:
    ## If speciesType is provided, initialize the
    ## correct species names
    ## Rule:
    ## Every gene (x_species) gets a corresponding
    ## protein (p_species). The gene equation contains
    ## regulatory logic, protein equation contains production
    ## degradation terms.
    ## Every 'protein' (p_species) only gets one variable,
    ## and the corresponding equation contains the regulatory
    ## logic term. There is no production/degradation associated
    ## with this species.
    varspecs = {}
    genelist = []
    proteinlist = []
    if speciesTypeDF is None:
        # Assume everything is a gene.
        for sp in species:
            varspecs['x_' + sp] = ''
            varspecs['p_' + sp] = ''
            genelist.append(sp)
    else:
        for i, row in speciesTypeDF.iterrows():
            sp = row['Species']
            if row['Type'] == 'protein':
                varspecs['p_' + sp] = ''
                proteinlist.append(sp)
            elif row['Type'] == 'gene':
                varspecs['x_' + sp] = ''
                varspecs['p_' + sp] = ''
                genelist.append(sp)
        specified = set(speciesTypeDF['Species'].values)
        for sp in species:
            if sp not in specified:
                genelist.append(sp)
                
    ## Useful for debugging. 
    # print(genelist)
    # print(proteinlist)

    par, x_max, parameterInputs = getParameters(DF,identicalPars,
                                                samplePars,
                                                sampleStd,
                                                genelist,
                                                proteinlist,
                                                withoutRules,
                                                parameterInputsDF,
                                                parameterSetDF,
                                                interactionStrengthDF)

    ##########################################################
    ## Assign values to alpha parameters representing the logic
    ## relationships between variables
    # Initialize new namespace
    boolodespace = {}
    for i,row in DF.iterrows():
        # Initialize species to 0
        tempStr = row['Gene'] + " = 0"  
        exec(tempStr, boolodespace)

    if parameterInputsDF is None:
        inputs = set()
    else:
        inputs = set(withoutRules)
        for k in parameterInputs[0].keys():
            # Initialize variables to 0
            tempStr = k + " = 0"  
            exec(tempStr, boolodespace)
            
    for i,row in DF.iterrows():
        # Basal alpha:
        # Execute the rule to figure out
        # the value of alpha_0
        exec('booleval = ' + row['Rule'], boolodespace) 
        par['alpha_'+row['Gene']] = int(boolodespace['booleval'])

    for i,row in DF.iterrows():
        rhs = row['Rule']
        rhs = rhs.replace('(',' ')
        rhs = rhs.replace(')',' ')
        tokens = rhs.split(' ')

        allreg = set([t for t in tokens if (t in species or t in inputs)])
        regulatorySpecies = set([t for t in tokens if t in species])
        inputreg = set([t for t in tokens if t in inputs])
        
        currSp = row['Gene']
        num = '( alpha_' + currSp
        den = '( 1'

        strengthSpecified = False
        
        if interactionStrengthDF is not None:
            if currSp in interactionStrengthDF['Gene1'].values:
                strengthSpecified = True
                # Get the list of regulators of currSp
                # whose interaction strengths have been specified
                regulatorsWithStrength = set(interactionStrengthDF[\
                                                                   interactionStrengthDF['Gene1']\
                                                                   == currSp]['Gene2'].values)
                print(regulatorsWithStrength)
                
        for i in range(1,len(allreg) + 1):
            for c in combinations(allreg,i):
                # Create the hill function terms for each regulator
                hills = []
                for ci in c:
                    if strengthSpecified and ci in regulatorsWithStrength:
                            hillThresholdName = 'k_' + ci + '_' + currSp
                    else:
                        hillThresholdName = 'k_' + ci
                        
                    if ci in regulatorySpecies:
                        # Note: Only proteins can be regulators
                        hills.append('(p_'+ci+'/'+hillThresholdName+')^n_'+ci)
                    elif ci in inputreg:
                        hills.append('('+ci+'/'+hillThresholdName+')^n_'+ci)
                mult = '*'.join(hills)
                # Create Numerator and Denominator
                den += ' +' +  mult                
                num += ' + a_' + currSp +'_'  + '_'.join(list(c)) + '*' + mult
                
                for i1, row1 in DF.iterrows():
                    exec(row1['Gene'] + ' = 0', boolodespace)

                for geneInList in c:
                    exec(geneInList + ' = 1', boolodespace)
                exec('boolval = ' + row['Rule'], boolodespace)

                par['a_' + currSp +'_'  + '_'.join(list(c))] = int(boolodespace['boolval']) 

        num += ' )'
        den += ' )'
        
        if currSp in proteinlist:
            Production =  '(' +num + '/' + den + ')'
            Degradation = 'p_' + currSp
            varspecs['p_' + currSp] = 'signalingtimescale*(y_max*' + Production \
                                       + '-' + Degradation + ')'
        else:
            Production = 'm_'+ currSp + '*(' +num + '/' + den + ')'
            Degradation = 'l_x_'  + currSp + '*x_' + currSp
            varspecs['x_' + currSp] =  Production \
                                       + '-' + Degradation
            # Create the corresponding translated protein equation
            varspecs['p_' + currSp] = 'r_'+currSp+'*'+'x_' +currSp + '- l_p_'+currSp+'*'+'p_' + currSp
            
    ##########################################################                                       
        
    # Initialize variables between 0 and 1, Doesn't matter.
    xvals = [1. for _ in range(len(genelist))]
    pvals = [20. for _ in range(len(proteinlist))]    
    ics = {}

    for sp, xv in zip(genelist, xvals):
        ics['x_' + sp] = xv
        ics['p_' + sp] = 0
    for sp, pv in zip(proteinlist, pvals):
        ics['p_' + sp] = pv

    ModelSpec = {}
    ModelSpec['varspecs'] = varspecs
    ModelSpec['pars'] = par
    ModelSpec['ics'] = ics
    return ModelSpec, parameterInputs, genelist, proteinlist, x_max


[docs]def simulateModel(Model, y0, parameters,isStochastic, tspan,seed):
    """Call numerical integration functions, either odeint() from Scipy,
    or simulator.eulersde() defined in simulator.py. By default, stochastic simulations are
    carried out using simulator.eulersde.

    :param Model: Function defining ODE model
    :type Model: function
    :param y0: array of initial values for each state variable
    :type y0: array
    :param parameters: list of parameter values to be used in simulations
    :type parameters: list
    :param isStochastic: User defined parameter. Default = True, can be turned off to perform ODE simulations.
    :type isStochastic: bool
    :param tspan: Time points to simulate
    :type tspan: ndarray
    :param seed: Seed to initialize random number generator
    :type seed: float
    :returns: 
        - P: Time course from numerical integration
    :rtype: ndarray

    """
    if not isStochastic:
        P = odeint(Model,y0,tspan,args=(parameters,))
    else:
        P = simulator.eulersde(Model,simulator.noise,y0,tspan,parameters,seed=seed)
    return(P)

[docs]def getInitialCondition(ss, ModelSpec, rnaIndex,
                        proteinIndex,
                        genelist, proteinlist,
                        varmapper,revvarmapper):
    """
    Calculate the initial values of all state variables. 
    Takes into consideration user defined initial conditions, and computes the steady 
    states of the protein variables based on the estimated values of their corresponding genes.

    :param ss: Steady state array
    :type ss: ndarray
    :param ModelSpec: Dictionary of dictionary specifying the ODE model, containing parameters, initial conditions and equations.
    :type ModelSpec: dict
    :param rnaIndex: list of indices of genes
    :type rnaIndex: list
    :param proteinIndex: List of indices of proteins
    :type proteinIndex: list
    :param genelist: List of names of all genes in the model
    :type genelist: list
    :param proteinlist: List of names of all proteins in the model
    :type proteinlist: list
    :param varmapper: Mapper: {variable name : index}
    :type varmapper: dict
    :param revvarmapper: Mapper: {index : variable name}
    :type revvarmapper: dict
    :returns:
        - newics: List containing new initial conditions
    """
    
    # Initialize
    new_ics = [0 for _ in range(len(varmapper.keys()))]
    # Set the mRNA ics
    for ind in rnaIndex:
        if ss[ind] < 0:
            ss[ind] = 0.0
        new_ics[ind] =  ss[ind]
        if new_ics[ind] < 0:
            new_ics[ind] = 0
    for p in proteinlist:
        ind = revvarmapper['p_'+p]
        if ss[ind] < 0:
            ss[ind] = 0.0
        new_ics[ind] =  ss[ind]
        if new_ics[ind] < 0:
            new_ics[ind] = 0            
    # Calculate the Protein ics based on mRNA levels
    for genename in genelist:
        pss = ((ModelSpec['pars']['r_' + genename])/\
                                      (ModelSpec['pars']['l_p_' + genename]))\
                                      *new_ics[revvarmapper['x_' + genename]]
        new_ics[revvarmapper['p_' + genename.replace('_','')]] = pss
    return(new_ics)

[docs]def simulateAndSample(argdict):
    allParameters = argdict['allParameters']
    parNames = argdict['parNames']
    Model = argdict['Model']
    isStochastic = argdict['isStochastic']
    tspan = argdict['tspan']
    varmapper = argdict['varmapper']
    timeIndex = argdict['timeIndex']
    genelist = argdict['genelist']
    proteinlist = argdict['proteinlist']
    writeProtein=argdict['writeProtein']
    cellid = argdict['cellid']
    outPrefix = argdict['outPrefix']
    sampleCells = argdict['sampleCells']
    ss = argdict['ss']
    ModelSpec = argdict['ModelSpec']
    rnaIndex = argdict['rnaIndex']
    proteinIndex = argdict['proteinIndex']
    genelist = argdict['genelist']
    proteinlist = argdict['proteinlist']
    revvarmapper = argdict['revvarmapper']
    seed = argdict['seed']
    pars = argdict['pars']
    x_max = argdict['x_max']
    if sampleCells:
        header = argdict['header']
    pars = {}
    for k, v in allParameters.items():
        pars[k] = v
    pars = [pars[k] for k in parNames]
    
    ## Boolean to check if a simulation is going to a
    ## 0 steady state, with all genes/proteins dying out
    retry = True
    trys = 0
    
    ## timepoints
    tps = [i for i in range(1,len(tspan))]
    ## gene ids
    gid = [i for i,n in varmapper.items() if 'x_' in n]
    outPrefix = outPrefix +'simulations/'
    while retry:
        seed += 1000
        y0_exp = getInitialCondition(ss, ModelSpec, rnaIndex, proteinIndex,
                                     genelist, proteinlist,
                                     varmapper,revvarmapper)
        
        P = simulateModel(Model, y0_exp, pars, isStochastic, tspan, seed)
        P = P.T
        retry = False
        ## Extract Time points
        subset = P[gid,:][:,tps]
        df = pd.DataFrame(subset,
                          index=pd.Index(genelist),
                          columns = ['E' + str(cellid) +'_' +str(i)\
                                     for i in tps])
        ## Heuristic:
        ## If the largest value of a protein achieved in a simulation is
        ## less than 10% of the y_max, drop the simulation.
        ## This check stems from the observation that in some simulations,
        ## all genes go to the 0 steady state in some rare simulations.
        dfmax = df.max()
        for col in df.columns:
            colmax = df[col].max()
            if colmax < 0.1*x_max:
                retry= True
                break
        
        if sampleCells:
            ## Write a single cell to file
            ## These samples allow for quickly and
            ## reproducibly testing the output.
            sampledf = utils.sampleCellFromTraj(cellid,
                                          tspan, 
                                          P,
                                          varmapper, timeIndex,
                                          genelist, proteinlist,
                                          header,
                                          writeProtein=writeProtein)
            sampledf = sampledf.T
            sampledf.to_csv(outPrefix + 'E' + str(cellid) + '-cell.csv')            
            
        trys += 1
        if trys > 1:
            print('try', trys)
            

    # write to file
    df.to_csv(outPrefix + 'E'+ str(cellid) + '.csv')

    
[docs]def Experiment(Model, ModelSpec,tspan,
               num_cells,
               sampleCells,
               varmapper, parmapper,
               genelist, proteinlist,
               outPrefix,icsDF,
               nClusters,
               x_max,
               doParallel,
               burnin=False,writeProtein=False,
               normalizeTrajectory=False):
    """
    Carry out an `in-silico` experiment. This function takes as input 
    an ODE model defined as a python function and carries out stochastic
    simulations. BoolODE defines a _cell_ as a single time point from 
    a simulated time course. Thus, in order to obtain 50 single cells,
    BoolODE carries out 50 simulations, which are stored in ./simulations/.
    Further, if it is known that the resulting single cell dataset will
    exhibit multiple trajectories, the user can specify  the number of clusters in
    `nClusters`; BoolODE will then cluster the entire simulation, such that each
    simulated trajectory possesess a cluster ID.

    :param Model: Function defining ODE model
    :type Model: function
    :param ModelSpec: Dictionary defining ODE model. See readBooleanRules()
    :type ModelSpec: dict
    :param tspan: Array of time points
    :type tspan: ndarray
    :param num_cells: Number of simulations to perform
    :type num_cells: int
    :param sampleCells: Bool that specifies if a random sample of size num_cells should be generated from the simulated output, where one cell is picked per simulation without replacement
    :type sampleCells: bool
    :param varmapper: 
    :type varmapper: dict
    :param parmapper: 
    :type parmapper: dict
    :param genelist: List of all gene names
    :type genelist:  list
    :param proteinlist: List of all protein names
    :type proteinlist: list
    :param outPrefix: Name of output folder. 
    :type outPrefix: str
    :param icsDF: Dataframe specifying initial condition for simulation
    :type icsDF: pandas DataFrame
    :param nClusters: Number of expected trajectories. Used to perform k-means clustering
    :type nClusters: int
    :param x_max: max value of gene. By default 2.0, will vary if parameters are sampled
    :type x_max: float
    :param doParallel: Bool specifying starting simulations in parallel
    :type doParallel: bool
    :param burnin: Bool specifying that initial fraction of simulation should be discarded. Obsolete
    :type burnin: bool
    :param writeProtein: Bool specifying if the protein values should be written to file. Default = False
    :type writeProtein: bool
    :param normalizeTrajectory: Bool specifying if the gene expression values should be scaled between 0 and 1.
    :type normalizeTrajectory: bool 
    """
    if not sampleCells:
        print("Note: Simulated trajectories will be clustered. nClusters = %d" % nClusters)
    ####################    
    allParameters = dict(ModelSpec['pars'])
    parNames = sorted(list(allParameters.keys()))
    ## Use default parameters 
    pars = [ModelSpec['pars'][k] for k in parNames]
    ####################
    rnaIndex = [i for i in range(len(varmapper.keys())) if 'x_' in varmapper[i]]
    revvarmapper = {v:k for k,v in varmapper.items()}
    proteinIndex = [i for i in range(len(varmapper.keys())) if 'p_' in varmapper[i]]

    y0 = [ModelSpec['ics'][varmapper[i]] for i in range(len(varmapper.keys()))]
    ss = np.zeros(len(varmapper.keys()))
    
    for i,k in varmapper.items():
        if 'x_' in k:
            ss[i] = 1.0
        elif 'p_' in k:
            if k.replace('p_','') in proteinlist:
                # Seting them to the threshold
                # causes them to drop to 0 rapidly
                # TODO: try setting to threshold < v < y_max
                ss[i] = 20.
            
    if icsDF is not None:
        icsspec = icsDF.loc[0]
        genes = ast.literal_eval(icsspec['Genes'])
        values = ast.literal_eval(icsspec['Values'])
        icsmap = {g:v for g,v in zip(genes,values)}
        for i,k in varmapper.items():
            for p in proteinlist:
                if p in icsmap.keys():
                    ss[revvarmapper['p_'+p]] = icsmap[p]
                else:
                    ss[revvarmapper['p_'+p]] = 0.01
            for g in genelist:
                if g in icsmap.keys():
                    ss[revvarmapper['x_'+g]] = icsmap[g]
                else:
                    ss[revvarmapper['x_'+g]] = 0.01
            
    outputfilenames = []
    for isStochastic in [True]: 
        # "WT" simulation
        if len(proteinlist) == 0:
            result = pd.DataFrame(index=pd.Index([varmapper[i] for i in rnaIndex]))
        else:
            speciesoi = [revvarmapper['p_' + p] for p in proteinlist]
            speciesoi.extend([revvarmapper['x_' + g] for g in genelist])
            result = pd.DataFrame(index=pd.Index([varmapper[i] for i in speciesoi]))
            
        frames = []
        if burnin:
            burninFraction = 0.25 # Chosen arbitrarily as 25%
            # Ignore the first 25% of the simulation
            startat = int(np.ceil(burninFraction*len(tspan)))
        else:
            startat = 0
            
        # Index of every possible time point. Sample from this list
        timeIndex = [i for i in range(startat, len(tspan))]        
        if sampleCells:

            # pre-define the time points from which a cell will be sampled
            # per simulation
            sampleAt = np.random.choice(timeIndex, size=num_cells)
            header = ['E' + str(cellid) + '_' + str(time) \
                      for cellid, time in\
                      zip(range(num_cells), sampleAt)]

        ## Construct dictionary of arguments to be passed
        ## to simulateAndSample(), done in parallel
        argdict = {}
        argdict['allParameters'] = allParameters
        argdict['parNames'] = parNames
        argdict['Model'] = Model
        argdict['isStochastic'] = isStochastic
        argdict['tspan'] = tspan
        argdict['varmapper'] = varmapper
        argdict['timeIndex'] = timeIndex
        argdict['genelist'] = genelist
        argdict['proteinlist'] = proteinlist
        argdict['writeProtein'] = writeProtein
        argdict['outPrefix'] = outPrefix
        argdict['sampleCells'] = sampleCells
        argdict['pars'] = pars
        argdict['ss'] = ss
        argdict['ModelSpec'] = ModelSpec
        argdict['rnaIndex'] = rnaIndex
        argdict['proteinIndex'] = proteinIndex
        argdict['genelist'] = genelist
        argdict['proteinlist'] = proteinlist
        argdict['revvarmapper'] = revvarmapper
        argdict['x_max'] = x_max

        if sampleCells:
            argdict['header'] = header

        simfilepath = outPrefix + 'simulations/'
        if len(simfilepath) > 0:
            if '/' in simfilepath:
                outDir = '/'.join(simfilepath.split('/')[:-1])
                if not os.path.exists(simfilepath):
                    print(simfilepath, "does not exist, creating it...")
                    os.makedirs(simfilepath)
        print('Starting simulations')
        start = time.time()
        ########################
        if doParallel:
            ## Carry out simulations in parrallel
            lock = Lock()
            jobs = []
            for cellid in range(num_cells):
                argdict['seed'] = cellid
                argdict['cellid'] = cellid
                p = Process(target=simulateAndSample,args=([argdict]))
                p.start()
                #p.join()
        else:
            for cellid in tqdm(range(num_cells)):
                argdict['seed'] = cellid
                argdict['cellid'] = cellid
                simulateAndSample(argdict)
        ########################
        ## Sleep for 1 s to allow for IO. Hack necessary for smalle number of simulations
        ## where sim time < IO time.
        time.sleep(1)
        print("Simulations took %0.3f s"%(time.time() - start))
        frames = []
        print('starting to concat files')
        start = time.time()
        if not sampleCells:
            # initialize dictionary to hold raveled values, used to cluster
            groupedDict = {} 
        for cellid in tqdm(range(num_cells)):
            if sampleCells:
                df = pd.read_csv(outPrefix + 'simulations/E'+str(cellid) + '-cell.csv',index_col=0)
                df = df.sort_index()                
            else:
                df = pd.read_csv(outPrefix + 'simulations/E'+str(cellid) + '.csv',index_col=0)
                df = df.sort_index()
                groupedDict['E' + str(cellid)] = df.values.ravel()
            frames.append(df.T)
        stop = time.time()
        print("Concating files took %.2f s" %(stop-start))
        result = pd.concat(frames,axis=0)
        result = result.T
        indices = result.index
        newindices = [i.replace('x_','') for i in indices]
        result.index = pd.Index(newindices)
        
        if not sampleCells:
            ## Carry out k-means clustering to identify which
            ## trajectory a simulation belongs to
            print('Starting k-means clustering')
            #headers = result.columns
            # simulations = sorted(list(set([h.split('_')[0] + '_' for h in headers])))
            ## group each simulation
            #groupedDict = {} 
            ## The following loop takes time for a large number of
            ## simulations
            #print('Raveling dataframe to start clustering')
            #for s in tqdm(simulations):
            #    groupedDict[s] = result.loc[:, result.columns.str.startswith(s)].values.ravel()
            groupedDF = pd.DataFrame.from_dict(groupedDict)
            print('Clustering simulations...')
            start = time.time()            
            # Find clusters in the experiments
            clusterLabels= KMeans(n_clusters=nClusters,
                                  n_jobs=8).fit(groupedDF.T.values).labels_
            print('Clustering took %0.3fs' % (time.time() - start))
            clusterDF = pd.DataFrame(data=clusterLabels, index =\
                                     groupedDF.columns, columns=['cl'])
        ##################################################

        if normalizeTrajectory:
            resultN = utils.normalizeExp(result)
        else:
            resultN = result
            
        if isStochastic:
            name = 'stoch'
        else:
            name = 'ode'

        if not sampleCells:
            clusterDF.to_csv(outPrefix + 'ClusterIds.csv')
        outputfilenames.append(outPrefix + name +'_experiment.txt')
    return outputfilenames, resultN
            
[docs]def parseArgs(args):
    parser = OptionParser()
    parser.add_option('', '--max-time', type='int',default=20,
                      help='Total time of simulation. (Default = 20)')
    parser.add_option('', '--num-cells', type='int',default=100,
                      help='Number of cells sample. (Default = 100)')
    parser.add_option('', '--sample-cells', action='store_true',default=False,
                      help="Sample a single cell from each trajectory?\n"
                      "By default will store full trajectory of each simulation (Default = False)")    
    parser.add_option('', '--add-dummy', action="store_true",default=False,
                      help='Add dummy genes')        
    parser.add_option('-n', '--normalize-trajectory', action="store_true",default=False,
                      help="Min-Max normalize genes across all experiments")
    parser.add_option('-i', '--identical-pars', action="store_true",default=False,
                      help="Set single value to similar parameters\n"
                      "NOTE: Consider setting this if you are using --sample-pars.")
    parser.add_option('-s', '--sample-pars', action="store_true",default=False,
                      help="Sample rate parameters around the hardcoded means\n"
                      ", using 10% stand. dev.")
    parser.add_option('', '--std', type='float',default=0.1,
                      help="If sampling parameters, specify standard deviation. (Default 0.1)")
    parser.add_option('', '--write-protein', action="store_true",default=False,
                      help="Write both protein and RNA values to file. Useful for debugging.")
    parser.add_option('-b', '--burn-in', action="store_true",default=False,
                      help="Treats the first 25% of the time course as burnin\n"
                      ", samples from the rest.")        
    parser.add_option('', '--outPrefix', type = 'str',default='',
                      help='Prefix for output files.')
    parser.add_option('', '--path', type='str',
                      help='Path to boolean model file')
    parser.add_option('', '--inputs', type='str',default='',
                      help='Path to input parameter files. This is different from specifying a parameter set!')
    parser.add_option('', '--pset', type='str',default='',
                      help='Path to pre-generated parameter set.')        
    parser.add_option('', '--ics', type='str',default='',
                      help='Path to list of initial conditions')
    parser.add_option('', '--strengths', type='str',default='',
                      help='Path to list of interaction strengths')
    parser.add_option('', '--species-type', type='str',default='',
                      help="Path to list of molecular species type file."\
                      "Useful to specify proteins/genes")
    parser.add_option('-c', '--nClusters', type='int',default='1',
                      help='Number of expected clusters in the dataset. (Default = 1)')    
    parser.add_option('', '--max-parents', type='int',default='1',
                      help='Number of parents to add to dummy genes. (Default = 1)')    
    parser.add_option('', '--do-parallel', action="store_true",default=False,
                      help='Run simulations in parallel. Recommended for > 50 simulations')    
        
    (opts, args) = parser.parse_args(args)

    return opts, args

[docs]def main(args):
    startfull = time.time()
    opts, args = parseArgs(args)
    path = opts.path
    if path is None or len(path) == 0:
        print("Please specify path to Boolean model")
        sys.exit()

    tmax = opts.max_time
    numCells = opts.num_cells
    identicalPars = opts.identical_pars
    burnin = opts.burn_in
    samplePars = opts.sample_pars
    sampleStd = opts.std
    outPrefix = opts.outPrefix
    parameterInputsPath = opts.inputs
    parameterSetPath = opts.pset
    icsPath = opts.ics    
    writeProtein = opts.write_protein
    normalizeTrajectory = opts.normalize_trajectory
    interactionStrengthPath = opts.strengths
    speciesTypePath = opts.species_type
    add_dummy = opts.add_dummy
    sampleCells = opts.sample_cells
    nClusters = opts.nClusters
    max_parents = opts.max_parents
    doParallel = opts.do_parallel    
    # if output directory doesn't exist
    # create one!
    if len(outPrefix) > 0:
        if '/' in outPrefix:
            outDir = '/'.join(outPrefix.split('/')[:-1])
            if not os.path.exists(outDir):
                print(outDir, "does not exist, creating it...")
                os.makedirs(outDir)
    
    if len(parameterInputsPath) > 0: 
        parameterInputsDF = pd.read_csv(parameterInputsPath,sep='\t')
    else:
        parameterInputsDF = None
        
    if len(parameterSetPath) > 0: 
        parameterSetDF = pd.read_csv(parameterSetPath,
                                     header=None,
                                     sep='\t',
                                     index_col=0)
    else:
        parameterSetDF = None

    if len(icsPath) > 0: 
        icsDF = pd.read_csv(icsPath,sep='\t',engine='python')
    else:
        icsDF = None

    if len(interactionStrengthPath) > 0:
        print("Interaction Strengths are supplied")
        interactionStrengthDF = pd.read_csv(interactionStrengthPath,sep='\t',engine='python')
    else:
        interactionStrengthDF = None

    if len(speciesTypePath) > 0: 
        speciesTypeDF = pd.read_csv(speciesTypePath,sep='\t')
    else:
        speciesTypeDF = None
        
    ## Hardcoded  ODE simulation time steps
    ## This can be reduced
    timesteps = 100
    tspan = np.linspace(0,tmax,tmax*timesteps)
    
    DF, withoutRules = readBooleanRules(path, parameterInputsPath,
                                        outPrefix, add_dummy, max_parents)
    if len(withoutRules) == 0:
        withoutRules = []
        
    it = 0
    someexception = True
    while someexception:
        try:
            genesDict = {}
            
            ModelSpec,\
                parameterInputs,\
                genelist,\
                proteinlist,\
                x_max = generateModelDict(DF,identicalPars,
                                          samplePars,
                                          sampleStd,
                                          withoutRules,
                                          speciesTypeDF,
                                          parameterInputsDF,
                                          parameterSetDF,
                                          interactionStrengthDF)
            # FIXME : ask user to pass row if parameter input file
            # Hardcoded. We only care about one input vector, say the first one
            if len(parameterInputsPath) > 0:
                ModelSpec['pars'].update(parameterInputs[0])
            varmapper = {i:var for i,var in enumerate(ModelSpec['varspecs'].keys())}
            parmapper = {i:par for i,par in enumerate(ModelSpec['pars'].keys())}    
            dir_path  = utils.writeModelToFile(ModelSpec)
            ## Load file from path
            model = SourceFileLoader("model", dir_path + "/model.py").load_module()
            #import model
            outputfilenames, resultDF = Experiment(model.Model,
                                                   ModelSpec,
                                                   tspan,
                                                   numCells,
                                                   sampleCells,
                                                   varmapper, parmapper,
                                                   genelist,proteinlist,
                                                   outPrefix, icsDF,
                                                   nClusters,
                                                   x_max,
                                                   doParallel,
                                                   burnin=burnin,
                                                   writeProtein=writeProtein,
                                                   normalizeTrajectory=normalizeTrajectory)
            print('Generating input files for pipline...')
            start = time.time()
            utils.generateInputFiles(resultDF, outputfilenames, DF,
                               withoutRules,
                               parameterInputsPath,
                               outPrefix=outPrefix)
            print('Input file generation took %0.2f s' % (time.time() - start))

            
            someexception= False
            
        except FloatingPointError as e:
            it +=1 
            print(e,"\nattempt %d" %it)
        
    utils.writeParametersToFile(ModelSpec, outPrefix)
    print("BoolODE.py took %0.2fs"% (time.time() - startfull))
    print('all done.')    

if __name__ == "__main__":
    main(sys.argv)