Source code for src.BoolODE

#!/usr/bin/env python
# coding: utf-8
__author__ = 'Amogh Jalihal'
import sys
import numpy as np
import scipy as sc
import pandas as pd
from itertools import combinations
from scipy.integrate import odeint
from sklearn.cluster import KMeans
import time
import importlib
import warnings
from tqdm import tqdm 
from optparse import OptionParser
from multiprocessing import Process, Lock, Manager
import ast
from importlib.machinery import SourceFileLoader
import utils 
import simulator 

np.seterr(all='raise')
import os

[docs]def readBooleanRules(path, parameterInputsPath, outPrefix='', add_dummy=False, max_parents=1): """ Reads a rule file from path and stores the rules in a dictionary Parameters ---------- :param path: Path to Boolean Rule file :type path: str :param parameterInputsPath: Path to file containing input parameters to the model :type parameterInputsPath: str :param add_dummy: Experimental feature, adds dummy targets to each gene in the model. Useful to change the density of the network :type add_dummy: bool :param max_parents: Experimental feature, specifies number of parent genes per dummy gene added :type max_parents: int :returns: - DF: Dataframe containing rules - withoutrules: list of nodes in input file without rules """ DF = pd.read_csv(path,sep='\t',engine='python') withRules = list(DF['Gene'].values) allnodes = set() for ind,row in DF.iterrows(): rhs = row['Rule'] rhs = rhs.replace('(',' ') rhs = rhs.replace(')',' ') tokens = rhs.split(' ') reg = [t for t in tokens if t not in ['not','and','or','']] allnodes.update(set(reg)) withoutRules = allnodes.difference(set(withRules)) for n in withoutRules: if len(parameterInputsPath) == 0: print(n, "has no rule, adding self-activation.") DF = DF.append({'Gene':n,'Rule':n},ignore_index=True) else: print("Treating %s as parameter" % {n}) ## Add dummy genes ## determinisitcally add parents if add_dummy: print('using max_parents=' + str(max_parents)) num_dummy = 2*len(allnodes) for dg in range(num_dummy): ## Variable number of parents # DF = DF.append({'Gene':'dummy' + str(dg), # 'Rule':' or '.join([s for s in \ # np.random.choice(list(allnodes), # size=np.random.choice([i+1 for i in range(len(allnodes)-1)]))])}, # ignore_index = True) DF = DF.append({'Gene':'dummy' + str(dg), 'Rule':' or '.join([s for s in \ np.random.choice(list(allnodes), size=max_parents)])}, ignore_index = True) DF.to_csv(outPrefix + 'rules-with-added-genes.csv') print(DF) return DF, withoutRules
[docs]def getParameters(DF,identicalPars, samplePars, sampleStd, genelist, proteinlist, withoutRules, parameterInputsDF, parameterSetDF, interactionStrengthDF): """ Create dictionary of parameters and values. Assigns parameter values by evaluating the Boolean expression for each variable. :param DF: Table of values with two columns, 'Gene' specifies target, 'Rule' specifies Boolean function :type DF: pandas DataFrame :param identicalPars: Passed to utils.getSaneNval to set identical parameters :type identicalPars: bool :param samplePars: Sample kinetic parameters using a Gaussian distribution centered around the default parameters :type samplePars: bool :param parameterInputsDF: Optional table that specifies input parameter values. Useful to specify experimental conditions. :type parameterInputsDF: pandas DataFrame :param parameterSetDF: Optional table that specifies predefined parameter set. :type parameterSetDF: pandas DataFrame :param interactionStrengthDF: Optional table that specifies interaction strengths. When not specified, default strength is set to 1. :type interactionStrengthDF: pandas DataFrame :returns: pars : dict Dictionary of parameters """ ## Set default parameters mRNATranscription = 20. mRNADegradation = mRNATranscription/2. proteinTranslation = 10 proteinDegradation = 1 ## The threshold is calculated as the max value of the species ## divided by 2. y_max = (proteinTranslation/proteinDegradation)*\ (mRNATranscription/mRNADegradation) x_max = (mRNATranscription/mRNADegradation) hillThreshold = y_max/2 # Chosen arbitrarily signalingtimescale = 5.0 hillCoefficient = 10 interactionStrength = 1.0 parameterNamePrefixAndDefaultsAll = { # Hill coefficients 'n_':hillCoefficient, # Thresholds 'k_':hillThreshold, } parameterNamePrefixAndDefaultsGenes = { # mRNA transcription rates 'm_':mRNATranscription, # mRNA degradation rates 'l_x_':mRNADegradation, # Protein translation rate 'r_':proteinTranslation, # protein degradation rates 'l_p_':proteinDegradation } par = dict() ## If there is even one signaling protein, ## create the y_max parameter if len(proteinlist) > 0: par['y_max'] = y_max par['signalingtimescale'] = signalingtimescale parameterInputsDict = {} interactionStrengths = {} ## Get set of species which have rules specified for them species = set(DF['Gene'].values) ## Check 1: ## If a parameter input file is specified, ## Every row in parameterInputsDF contains two ## columns: ## 'Inputs' is a python list of strings ## specifying the parameters. ## 'Values' is a python list of floats ## each corresponding to the value of a parameter. ## Now, we create a fake hill term for this input, even ## though there is no equation corresponding to it. Thus, ## we create a hillThreshold and hillCoefficient ## term for it. This is useful so we can treat this input ## parameter as a "regulator" and leave the structure of ## the logic term unchanged. if parameterInputsDF is not None: for i, row in parameterInputsDF.iterrows(): # initialize parameterInputsDict[i] = {p:0 for p in withoutRules} inputParams = ast.literal_eval(row['Inputs']) inputValues = ast.literal_eval(row['Values']) # Set to a max value parameterInputsDict[i].update({p:hillThreshold*2 for p,v in\ zip(inputParams,inputValues) if v > 0}) # Add input parameters, set to 0 by default par.update({p:0 for p in parameterInputsDict[0].keys()}) par.update({'k_'+p:hillThreshold for p in parameterInputsDict[0].keys()}) par.update({'n_'+p:hillCoefficient for p in parameterInputsDict[0].keys()}) ## Check 2: ## If interaction strengths are specified. ## Create a new parameter specifying the strength ## of a given interaction. For all other equations in ## which "regulator" appears in, its hillThreshold value ## will remain the default value. if interactionStrengthDF is not None: for i,row in interactionStrengthDF.iterrows(): regulator = row['Gene2'] target = row['Gene1'] interactionStrength = row['Strength'] par.update({'k_' + regulator + '_' + target: hillThreshold/interactionStrength}) ## Check 3: ## Whether to sample parameters or stick to defaults if samplePars: print("Sampling parameters") sigmamult = sampleStd print("Using std=" + str(sampleStd)) lomult = 0.9 himult = 1.1 for parPrefix, parDefault in parameterNamePrefixAndDefaultsAll.items(): sampledParameterValues = utils.getSaneNval(len(species),\ lo=lomult*parDefault,\ hi=himult*parDefault,\ mu=parDefault,\ sig=sigmamult*parDefault,\ identicalPars=identicalPars) for sp, sparval in zip(species, sampledParameterValues): if sp in genelist: par[parPrefix + sp] = sparval transcriptionRate = 0.0 mRNADegradationRate = 0.0 for parPrefix, parDefault in parameterNamePrefixAndDefaultsGenes.items(): sampledParameterValues = utils.getSaneNval(len(species),\ lo=lomult*parDefault,\ hi=himult*parDefault,\ mu=parDefault,\ sig=sigmamult*parDefault,\ identicalPars=identicalPars) if identicalPars: if parPrefix == 'm_': transcriptionRate = sampledParameterValues[0] if parPrefix == 'l_x_': mRNADegradationRate = sampledParameterValues[0] else: if parPrefix == 'm_': transcriptionRate = parDefault if parPrefix == 'l_x_': mRNADegradationRate = parDefault for sp, sparval in zip(species, sampledParameterValues): if sp in genelist: par[parPrefix + sp] = sparval ## Based on these samples, estimate max value a ## gene can take at _steady-state_ x_max = (transcriptionRate/mRNADegradationRate) else: print("Fixing rate parameters to defaults") for parPrefix, parDefault in parameterNamePrefixAndDefaultsAll.items(): for sp in species: par[parPrefix + sp] = parDefault for parPrefix, parDefault in parameterNamePrefixAndDefaultsGenes.items(): for sp in species: if sp in genelist: par[parPrefix + sp] = parDefault ## Final Check. ## If parameterSetDF is specified, reassign ## parameter values to those from table ## This guarantees that all the parameters are defined without ## specific checks in parameterSetDF if parameterSetDF is not None: for pname, pvalue in parameterSetDF.iterrows(): par[pname] = pvalue[1] return par, x_max, parameterInputsDict
[docs]def generateModelDict(DF,identicalPars, samplePars, sampleStd, withoutRules, speciesTypeDF, parameterInputsDF, parameterSetDF, interactionStrengthDF): """ Take a DataFrame object with Boolean rules, construct ODE equations for each variable. Takes optional parameter inputs and interaction strengths :param DF: Table of values with two columns, 'Gene' specifies target, 'Rule' specifies Boolean function :type DF: pandas DataFrame :param identicalPars: Passed to utils.getSaneNval to set identical parameters :type identicalPars: bool :param samplePars: Sample kinetic parameters using a Gaussian distribution centered around the default parameters :type samplePars: bool :param sampleStd: Sample from a distribution of mean=par, sigma=sampleStd*par :type sampleStd: float :param speciesTypeDF: Table defining type of each species. Takes values 'gene' or 'protein' :type speciesTypeDF: pandas DataFrame :param parameterInputsDF: Optional table that specifies parameter input values. Useful to specify experimental conditions. :type parameterInputsDF: pandas DataFrame :param parameterSetDF: Optional table that specifies a predefined parameter set. :type parameterSetDF: pandas DataFrame :param interactionStrengthDF: Optional table that specifies interaction strengths. When not specified, default strength is set to 1. :type interactionStrengthDF: pandas DataFrame :returns: ModelSpec : dict Dictionary of dictionaries. 'varspecs' {variable:ODE equation} 'ics' {variable:initial condition} 'pars' {parameter:parameter value} """ species = set(DF['Gene'].values) # Variables: ## Check: ## If speciesType is provided, initialize the ## correct species names ## Rule: ## Every gene (x_species) gets a corresponding ## protein (p_species). The gene equation contains ## regulatory logic, protein equation contains production ## degradation terms. ## Every 'protein' (p_species) only gets one variable, ## and the corresponding equation contains the regulatory ## logic term. There is no production/degradation associated ## with this species. varspecs = {} genelist = [] proteinlist = [] if speciesTypeDF is None: # Assume everything is a gene. for sp in species: varspecs['x_' + sp] = '' varspecs['p_' + sp] = '' genelist.append(sp) else: for i, row in speciesTypeDF.iterrows(): sp = row['Species'] if row['Type'] == 'protein': varspecs['p_' + sp] = '' proteinlist.append(sp) elif row['Type'] == 'gene': varspecs['x_' + sp] = '' varspecs['p_' + sp] = '' genelist.append(sp) specified = set(speciesTypeDF['Species'].values) for sp in species: if sp not in specified: genelist.append(sp) ## Useful for debugging. # print(genelist) # print(proteinlist) par, x_max, parameterInputs = getParameters(DF,identicalPars, samplePars, sampleStd, genelist, proteinlist, withoutRules, parameterInputsDF, parameterSetDF, interactionStrengthDF) ########################################################## ## Assign values to alpha parameters representing the logic ## relationships between variables # Initialize new namespace boolodespace = {} for i,row in DF.iterrows(): # Initialize species to 0 tempStr = row['Gene'] + " = 0" exec(tempStr, boolodespace) if parameterInputsDF is None: inputs = set() else: inputs = set(withoutRules) for k in parameterInputs[0].keys(): # Initialize variables to 0 tempStr = k + " = 0" exec(tempStr, boolodespace) for i,row in DF.iterrows(): # Basal alpha: # Execute the rule to figure out # the value of alpha_0 exec('booleval = ' + row['Rule'], boolodespace) par['alpha_'+row['Gene']] = int(boolodespace['booleval']) for i,row in DF.iterrows(): rhs = row['Rule'] rhs = rhs.replace('(',' ') rhs = rhs.replace(')',' ') tokens = rhs.split(' ') allreg = set([t for t in tokens if (t in species or t in inputs)]) regulatorySpecies = set([t for t in tokens if t in species]) inputreg = set([t for t in tokens if t in inputs]) currSp = row['Gene'] num = '( alpha_' + currSp den = '( 1' strengthSpecified = False if interactionStrengthDF is not None: if currSp in interactionStrengthDF['Gene1'].values: strengthSpecified = True # Get the list of regulators of currSp # whose interaction strengths have been specified regulatorsWithStrength = set(interactionStrengthDF[\ interactionStrengthDF['Gene1']\ == currSp]['Gene2'].values) print(regulatorsWithStrength) for i in range(1,len(allreg) + 1): for c in combinations(allreg,i): # Create the hill function terms for each regulator hills = [] for ci in c: if strengthSpecified and ci in regulatorsWithStrength: hillThresholdName = 'k_' + ci + '_' + currSp else: hillThresholdName = 'k_' + ci if ci in regulatorySpecies: # Note: Only proteins can be regulators hills.append('(p_'+ci+'/'+hillThresholdName+')^n_'+ci) elif ci in inputreg: hills.append('('+ci+'/'+hillThresholdName+')^n_'+ci) mult = '*'.join(hills) # Create Numerator and Denominator den += ' +' + mult num += ' + a_' + currSp +'_' + '_'.join(list(c)) + '*' + mult for i1, row1 in DF.iterrows(): exec(row1['Gene'] + ' = 0', boolodespace) for geneInList in c: exec(geneInList + ' = 1', boolodespace) exec('boolval = ' + row['Rule'], boolodespace) par['a_' + currSp +'_' + '_'.join(list(c))] = int(boolodespace['boolval']) num += ' )' den += ' )' if currSp in proteinlist: Production = '(' +num + '/' + den + ')' Degradation = 'p_' + currSp varspecs['p_' + currSp] = 'signalingtimescale*(y_max*' + Production \ + '-' + Degradation + ')' else: Production = 'm_'+ currSp + '*(' +num + '/' + den + ')' Degradation = 'l_x_' + currSp + '*x_' + currSp varspecs['x_' + currSp] = Production \ + '-' + Degradation # Create the corresponding translated protein equation varspecs['p_' + currSp] = 'r_'+currSp+'*'+'x_' +currSp + '- l_p_'+currSp+'*'+'p_' + currSp ########################################################## # Initialize variables between 0 and 1, Doesn't matter. xvals = [1. for _ in range(len(genelist))] pvals = [20. for _ in range(len(proteinlist))] ics = {} for sp, xv in zip(genelist, xvals): ics['x_' + sp] = xv ics['p_' + sp] = 0 for sp, pv in zip(proteinlist, pvals): ics['p_' + sp] = pv ModelSpec = {} ModelSpec['varspecs'] = varspecs ModelSpec['pars'] = par ModelSpec['ics'] = ics return ModelSpec, parameterInputs, genelist, proteinlist, x_max
[docs]def simulateModel(Model, y0, parameters,isStochastic, tspan,seed): """Call numerical integration functions, either odeint() from Scipy, or simulator.eulersde() defined in simulator.py. By default, stochastic simulations are carried out using simulator.eulersde. :param Model: Function defining ODE model :type Model: function :param y0: array of initial values for each state variable :type y0: array :param parameters: list of parameter values to be used in simulations :type parameters: list :param isStochastic: User defined parameter. Default = True, can be turned off to perform ODE simulations. :type isStochastic: bool :param tspan: Time points to simulate :type tspan: ndarray :param seed: Seed to initialize random number generator :type seed: float :returns: - P: Time course from numerical integration :rtype: ndarray """ if not isStochastic: P = odeint(Model,y0,tspan,args=(parameters,)) else: P = simulator.eulersde(Model,simulator.noise,y0,tspan,parameters,seed=seed) return(P)
[docs]def getInitialCondition(ss, ModelSpec, rnaIndex, proteinIndex, genelist, proteinlist, varmapper,revvarmapper): """ Calculate the initial values of all state variables. Takes into consideration user defined initial conditions, and computes the steady states of the protein variables based on the estimated values of their corresponding genes. :param ss: Steady state array :type ss: ndarray :param ModelSpec: Dictionary of dictionary specifying the ODE model, containing parameters, initial conditions and equations. :type ModelSpec: dict :param rnaIndex: list of indices of genes :type rnaIndex: list :param proteinIndex: List of indices of proteins :type proteinIndex: list :param genelist: List of names of all genes in the model :type genelist: list :param proteinlist: List of names of all proteins in the model :type proteinlist: list :param varmapper: Mapper: {variable name : index} :type varmapper: dict :param revvarmapper: Mapper: {index : variable name} :type revvarmapper: dict :returns: - newics: List containing new initial conditions """ # Initialize new_ics = [0 for _ in range(len(varmapper.keys()))] # Set the mRNA ics for ind in rnaIndex: if ss[ind] < 0: ss[ind] = 0.0 new_ics[ind] = ss[ind] if new_ics[ind] < 0: new_ics[ind] = 0 for p in proteinlist: ind = revvarmapper['p_'+p] if ss[ind] < 0: ss[ind] = 0.0 new_ics[ind] = ss[ind] if new_ics[ind] < 0: new_ics[ind] = 0 # Calculate the Protein ics based on mRNA levels for genename in genelist: pss = ((ModelSpec['pars']['r_' + genename])/\ (ModelSpec['pars']['l_p_' + genename]))\ *new_ics[revvarmapper['x_' + genename]] new_ics[revvarmapper['p_' + genename.replace('_','')]] = pss return(new_ics)
[docs]def simulateAndSample(argdict): allParameters = argdict['allParameters'] parNames = argdict['parNames'] Model = argdict['Model'] isStochastic = argdict['isStochastic'] tspan = argdict['tspan'] varmapper = argdict['varmapper'] timeIndex = argdict['timeIndex'] genelist = argdict['genelist'] proteinlist = argdict['proteinlist'] writeProtein=argdict['writeProtein'] cellid = argdict['cellid'] outPrefix = argdict['outPrefix'] sampleCells = argdict['sampleCells'] ss = argdict['ss'] ModelSpec = argdict['ModelSpec'] rnaIndex = argdict['rnaIndex'] proteinIndex = argdict['proteinIndex'] genelist = argdict['genelist'] proteinlist = argdict['proteinlist'] revvarmapper = argdict['revvarmapper'] seed = argdict['seed'] pars = argdict['pars'] x_max = argdict['x_max'] if sampleCells: header = argdict['header'] pars = {} for k, v in allParameters.items(): pars[k] = v pars = [pars[k] for k in parNames] ## Boolean to check if a simulation is going to a ## 0 steady state, with all genes/proteins dying out retry = True trys = 0 ## timepoints tps = [i for i in range(1,len(tspan))] ## gene ids gid = [i for i,n in varmapper.items() if 'x_' in n] outPrefix = outPrefix +'simulations/' while retry: seed += 1000 y0_exp = getInitialCondition(ss, ModelSpec, rnaIndex, proteinIndex, genelist, proteinlist, varmapper,revvarmapper) P = simulateModel(Model, y0_exp, pars, isStochastic, tspan, seed) P = P.T retry = False ## Extract Time points subset = P[gid,:][:,tps] df = pd.DataFrame(subset, index=pd.Index(genelist), columns = ['E' + str(cellid) +'_' +str(i)\ for i in tps]) ## Heuristic: ## If the largest value of a protein achieved in a simulation is ## less than 10% of the y_max, drop the simulation. ## This check stems from the observation that in some simulations, ## all genes go to the 0 steady state in some rare simulations. dfmax = df.max() for col in df.columns: colmax = df[col].max() if colmax < 0.1*x_max: retry= True break if sampleCells: ## Write a single cell to file ## These samples allow for quickly and ## reproducibly testing the output. sampledf = utils.sampleCellFromTraj(cellid, tspan, P, varmapper, timeIndex, genelist, proteinlist, header, writeProtein=writeProtein) sampledf = sampledf.T sampledf.to_csv(outPrefix + 'E' + str(cellid) + '-cell.csv') trys += 1 if trys > 1: print('try', trys) # write to file df.to_csv(outPrefix + 'E'+ str(cellid) + '.csv')
[docs]def Experiment(Model, ModelSpec,tspan, num_cells, sampleCells, varmapper, parmapper, genelist, proteinlist, outPrefix,icsDF, nClusters, x_max, doParallel, burnin=False,writeProtein=False, normalizeTrajectory=False): """ Carry out an `in-silico` experiment. This function takes as input an ODE model defined as a python function and carries out stochastic simulations. BoolODE defines a _cell_ as a single time point from a simulated time course. Thus, in order to obtain 50 single cells, BoolODE carries out 50 simulations, which are stored in ./simulations/. Further, if it is known that the resulting single cell dataset will exhibit multiple trajectories, the user can specify the number of clusters in `nClusters`; BoolODE will then cluster the entire simulation, such that each simulated trajectory possesess a cluster ID. :param Model: Function defining ODE model :type Model: function :param ModelSpec: Dictionary defining ODE model. See readBooleanRules() :type ModelSpec: dict :param tspan: Array of time points :type tspan: ndarray :param num_cells: Number of simulations to perform :type num_cells: int :param sampleCells: Bool that specifies if a random sample of size num_cells should be generated from the simulated output, where one cell is picked per simulation without replacement :type sampleCells: bool :param varmapper: :type varmapper: dict :param parmapper: :type parmapper: dict :param genelist: List of all gene names :type genelist: list :param proteinlist: List of all protein names :type proteinlist: list :param outPrefix: Name of output folder. :type outPrefix: str :param icsDF: Dataframe specifying initial condition for simulation :type icsDF: pandas DataFrame :param nClusters: Number of expected trajectories. Used to perform k-means clustering :type nClusters: int :param x_max: max value of gene. By default 2.0, will vary if parameters are sampled :type x_max: float :param doParallel: Bool specifying starting simulations in parallel :type doParallel: bool :param burnin: Bool specifying that initial fraction of simulation should be discarded. Obsolete :type burnin: bool :param writeProtein: Bool specifying if the protein values should be written to file. Default = False :type writeProtein: bool :param normalizeTrajectory: Bool specifying if the gene expression values should be scaled between 0 and 1. :type normalizeTrajectory: bool """ if not sampleCells: print("Note: Simulated trajectories will be clustered. nClusters = %d" % nClusters) #################### allParameters = dict(ModelSpec['pars']) parNames = sorted(list(allParameters.keys())) ## Use default parameters pars = [ModelSpec['pars'][k] for k in parNames] #################### rnaIndex = [i for i in range(len(varmapper.keys())) if 'x_' in varmapper[i]] revvarmapper = {v:k for k,v in varmapper.items()} proteinIndex = [i for i in range(len(varmapper.keys())) if 'p_' in varmapper[i]] y0 = [ModelSpec['ics'][varmapper[i]] for i in range(len(varmapper.keys()))] ss = np.zeros(len(varmapper.keys())) for i,k in varmapper.items(): if 'x_' in k: ss[i] = 1.0 elif 'p_' in k: if k.replace('p_','') in proteinlist: # Seting them to the threshold # causes them to drop to 0 rapidly # TODO: try setting to threshold < v < y_max ss[i] = 20. if icsDF is not None: icsspec = icsDF.loc[0] genes = ast.literal_eval(icsspec['Genes']) values = ast.literal_eval(icsspec['Values']) icsmap = {g:v for g,v in zip(genes,values)} for i,k in varmapper.items(): for p in proteinlist: if p in icsmap.keys(): ss[revvarmapper['p_'+p]] = icsmap[p] else: ss[revvarmapper['p_'+p]] = 0.01 for g in genelist: if g in icsmap.keys(): ss[revvarmapper['x_'+g]] = icsmap[g] else: ss[revvarmapper['x_'+g]] = 0.01 outputfilenames = [] for isStochastic in [True]: # "WT" simulation if len(proteinlist) == 0: result = pd.DataFrame(index=pd.Index([varmapper[i] for i in rnaIndex])) else: speciesoi = [revvarmapper['p_' + p] for p in proteinlist] speciesoi.extend([revvarmapper['x_' + g] for g in genelist]) result = pd.DataFrame(index=pd.Index([varmapper[i] for i in speciesoi])) frames = [] if burnin: burninFraction = 0.25 # Chosen arbitrarily as 25% # Ignore the first 25% of the simulation startat = int(np.ceil(burninFraction*len(tspan))) else: startat = 0 # Index of every possible time point. Sample from this list timeIndex = [i for i in range(startat, len(tspan))] if sampleCells: # pre-define the time points from which a cell will be sampled # per simulation sampleAt = np.random.choice(timeIndex, size=num_cells) header = ['E' + str(cellid) + '_' + str(time) \ for cellid, time in\ zip(range(num_cells), sampleAt)] ## Construct dictionary of arguments to be passed ## to simulateAndSample(), done in parallel argdict = {} argdict['allParameters'] = allParameters argdict['parNames'] = parNames argdict['Model'] = Model argdict['isStochastic'] = isStochastic argdict['tspan'] = tspan argdict['varmapper'] = varmapper argdict['timeIndex'] = timeIndex argdict['genelist'] = genelist argdict['proteinlist'] = proteinlist argdict['writeProtein'] = writeProtein argdict['outPrefix'] = outPrefix argdict['sampleCells'] = sampleCells argdict['pars'] = pars argdict['ss'] = ss argdict['ModelSpec'] = ModelSpec argdict['rnaIndex'] = rnaIndex argdict['proteinIndex'] = proteinIndex argdict['genelist'] = genelist argdict['proteinlist'] = proteinlist argdict['revvarmapper'] = revvarmapper argdict['x_max'] = x_max if sampleCells: argdict['header'] = header simfilepath = outPrefix + 'simulations/' if len(simfilepath) > 0: if '/' in simfilepath: outDir = '/'.join(simfilepath.split('/')[:-1]) if not os.path.exists(simfilepath): print(simfilepath, "does not exist, creating it...") os.makedirs(simfilepath) print('Starting simulations') start = time.time() ######################## if doParallel: ## Carry out simulations in parrallel lock = Lock() jobs = [] for cellid in range(num_cells): argdict['seed'] = cellid argdict['cellid'] = cellid p = Process(target=simulateAndSample,args=([argdict])) p.start() #p.join() else: for cellid in tqdm(range(num_cells)): argdict['seed'] = cellid argdict['cellid'] = cellid simulateAndSample(argdict) ######################## ## Sleep for 1 s to allow for IO. Hack necessary for smalle number of simulations ## where sim time < IO time. time.sleep(1) print("Simulations took %0.3f s"%(time.time() - start)) frames = [] print('starting to concat files') start = time.time() if not sampleCells: # initialize dictionary to hold raveled values, used to cluster groupedDict = {} for cellid in tqdm(range(num_cells)): if sampleCells: df = pd.read_csv(outPrefix + 'simulations/E'+str(cellid) + '-cell.csv',index_col=0) df = df.sort_index() else: df = pd.read_csv(outPrefix + 'simulations/E'+str(cellid) + '.csv',index_col=0) df = df.sort_index() groupedDict['E' + str(cellid)] = df.values.ravel() frames.append(df.T) stop = time.time() print("Concating files took %.2f s" %(stop-start)) result = pd.concat(frames,axis=0) result = result.T indices = result.index newindices = [i.replace('x_','') for i in indices] result.index = pd.Index(newindices) if not sampleCells: ## Carry out k-means clustering to identify which ## trajectory a simulation belongs to print('Starting k-means clustering') #headers = result.columns # simulations = sorted(list(set([h.split('_')[0] + '_' for h in headers]))) ## group each simulation #groupedDict = {} ## The following loop takes time for a large number of ## simulations #print('Raveling dataframe to start clustering') #for s in tqdm(simulations): # groupedDict[s] = result.loc[:, result.columns.str.startswith(s)].values.ravel() groupedDF = pd.DataFrame.from_dict(groupedDict) print('Clustering simulations...') start = time.time() # Find clusters in the experiments clusterLabels= KMeans(n_clusters=nClusters, n_jobs=8).fit(groupedDF.T.values).labels_ print('Clustering took %0.3fs' % (time.time() - start)) clusterDF = pd.DataFrame(data=clusterLabels, index =\ groupedDF.columns, columns=['cl']) ################################################## if normalizeTrajectory: resultN = utils.normalizeExp(result) else: resultN = result if isStochastic: name = 'stoch' else: name = 'ode' if not sampleCells: clusterDF.to_csv(outPrefix + 'ClusterIds.csv') outputfilenames.append(outPrefix + name +'_experiment.txt') return outputfilenames, resultN
[docs]def parseArgs(args): parser = OptionParser() parser.add_option('', '--max-time', type='int',default=20, help='Total time of simulation. (Default = 20)') parser.add_option('', '--num-cells', type='int',default=100, help='Number of cells sample. (Default = 100)') parser.add_option('', '--sample-cells', action='store_true',default=False, help="Sample a single cell from each trajectory?\n" "By default will store full trajectory of each simulation (Default = False)") parser.add_option('', '--add-dummy', action="store_true",default=False, help='Add dummy genes') parser.add_option('-n', '--normalize-trajectory', action="store_true",default=False, help="Min-Max normalize genes across all experiments") parser.add_option('-i', '--identical-pars', action="store_true",default=False, help="Set single value to similar parameters\n" "NOTE: Consider setting this if you are using --sample-pars.") parser.add_option('-s', '--sample-pars', action="store_true",default=False, help="Sample rate parameters around the hardcoded means\n" ", using 10% stand. dev.") parser.add_option('', '--std', type='float',default=0.1, help="If sampling parameters, specify standard deviation. (Default 0.1)") parser.add_option('', '--write-protein', action="store_true",default=False, help="Write both protein and RNA values to file. Useful for debugging.") parser.add_option('-b', '--burn-in', action="store_true",default=False, help="Treats the first 25% of the time course as burnin\n" ", samples from the rest.") parser.add_option('', '--outPrefix', type = 'str',default='', help='Prefix for output files.') parser.add_option('', '--path', type='str', help='Path to boolean model file') parser.add_option('', '--inputs', type='str',default='', help='Path to input parameter files. This is different from specifying a parameter set!') parser.add_option('', '--pset', type='str',default='', help='Path to pre-generated parameter set.') parser.add_option('', '--ics', type='str',default='', help='Path to list of initial conditions') parser.add_option('', '--strengths', type='str',default='', help='Path to list of interaction strengths') parser.add_option('', '--species-type', type='str',default='', help="Path to list of molecular species type file."\ "Useful to specify proteins/genes") parser.add_option('-c', '--nClusters', type='int',default='1', help='Number of expected clusters in the dataset. (Default = 1)') parser.add_option('', '--max-parents', type='int',default='1', help='Number of parents to add to dummy genes. (Default = 1)') parser.add_option('', '--do-parallel', action="store_true",default=False, help='Run simulations in parallel. Recommended for > 50 simulations') (opts, args) = parser.parse_args(args) return opts, args
[docs]def main(args): startfull = time.time() opts, args = parseArgs(args) path = opts.path if path is None or len(path) == 0: print("Please specify path to Boolean model") sys.exit() tmax = opts.max_time numCells = opts.num_cells identicalPars = opts.identical_pars burnin = opts.burn_in samplePars = opts.sample_pars sampleStd = opts.std outPrefix = opts.outPrefix parameterInputsPath = opts.inputs parameterSetPath = opts.pset icsPath = opts.ics writeProtein = opts.write_protein normalizeTrajectory = opts.normalize_trajectory interactionStrengthPath = opts.strengths speciesTypePath = opts.species_type add_dummy = opts.add_dummy sampleCells = opts.sample_cells nClusters = opts.nClusters max_parents = opts.max_parents doParallel = opts.do_parallel # if output directory doesn't exist # create one! if len(outPrefix) > 0: if '/' in outPrefix: outDir = '/'.join(outPrefix.split('/')[:-1]) if not os.path.exists(outDir): print(outDir, "does not exist, creating it...") os.makedirs(outDir) if len(parameterInputsPath) > 0: parameterInputsDF = pd.read_csv(parameterInputsPath,sep='\t') else: parameterInputsDF = None if len(parameterSetPath) > 0: parameterSetDF = pd.read_csv(parameterSetPath, header=None, sep='\t', index_col=0) else: parameterSetDF = None if len(icsPath) > 0: icsDF = pd.read_csv(icsPath,sep='\t',engine='python') else: icsDF = None if len(interactionStrengthPath) > 0: print("Interaction Strengths are supplied") interactionStrengthDF = pd.read_csv(interactionStrengthPath,sep='\t',engine='python') else: interactionStrengthDF = None if len(speciesTypePath) > 0: speciesTypeDF = pd.read_csv(speciesTypePath,sep='\t') else: speciesTypeDF = None ## Hardcoded ODE simulation time steps ## This can be reduced timesteps = 100 tspan = np.linspace(0,tmax,tmax*timesteps) DF, withoutRules = readBooleanRules(path, parameterInputsPath, outPrefix, add_dummy, max_parents) if len(withoutRules) == 0: withoutRules = [] it = 0 someexception = True while someexception: try: genesDict = {} ModelSpec,\ parameterInputs,\ genelist,\ proteinlist,\ x_max = generateModelDict(DF,identicalPars, samplePars, sampleStd, withoutRules, speciesTypeDF, parameterInputsDF, parameterSetDF, interactionStrengthDF) # FIXME : ask user to pass row if parameter input file # Hardcoded. We only care about one input vector, say the first one if len(parameterInputsPath) > 0: ModelSpec['pars'].update(parameterInputs[0]) varmapper = {i:var for i,var in enumerate(ModelSpec['varspecs'].keys())} parmapper = {i:par for i,par in enumerate(ModelSpec['pars'].keys())} dir_path = utils.writeModelToFile(ModelSpec) ## Load file from path model = SourceFileLoader("model", dir_path + "/model.py").load_module() #import model outputfilenames, resultDF = Experiment(model.Model, ModelSpec, tspan, numCells, sampleCells, varmapper, parmapper, genelist,proteinlist, outPrefix, icsDF, nClusters, x_max, doParallel, burnin=burnin, writeProtein=writeProtein, normalizeTrajectory=normalizeTrajectory) print('Generating input files for pipline...') start = time.time() utils.generateInputFiles(resultDF, outputfilenames, DF, withoutRules, parameterInputsPath, outPrefix=outPrefix) print('Input file generation took %0.2f s' % (time.time() - start)) someexception= False except FloatingPointError as e: it +=1 print(e,"\nattempt %d" %it) utils.writeParametersToFile(ModelSpec, outPrefix) print("BoolODE.py took %0.2fs"% (time.time() - startfull)) print('all done.')
if __name__ == "__main__": main(sys.argv)