Source code for BLEval


"""
BEELINE Evaluation (:mod:`BLEval`) module contains the following
:class:`BLEval.BLEval` and three additional classes used in the
definition of BLEval class 

- :class:`BLEval.ConfigParser` 
- :class:`BLEval.InputSettings` 
- :class:`BLEval.OutputSettings`


"""
import os
import yaml
import argparse
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing
from pathlib import Path
import concurrent.futures
from itertools import permutations
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from networkx.convert_matrix import from_pandas_adjacency


# local imports
from BLEval.parseTime import getTime
from BLEval.computeDGAUC import PRROC
from BLEval.computeJaccard import Jaccard
from BLEval.computeSpearman import Spearman
from BLEval.computeNetMotifs import Motifs
from BLEval.computeEarlyPrec import EarlyPrec
#from BLEval.computePathStats import pathAnalysis
from BLEval.computeSignedEPrec import signedEPrec


[docs]class InputSettings(object):
    '''
    The class for storing the names of input files.
    This initilizes an InputSettings object based on the
    following three parameters.
    

    :param datadir:   input dataset root directory, typically 'inputs/'
    :type datadir: str

    :param datasets:   List of dataset names
    :type datasets: list
        
    :param algorithms:   List of algorithm names
    :type algorithms: list
    '''

    def __init__(self,
            datadir, datasets, algorithms) -> None:

        self.datadir = datadir
        self.datasets = datasets
        self.algorithms = algorithms


[docs]class OutputSettings(object):
    '''    
    The class for storing the names of directories that output should
    be written to. This initilizes an OutputSettings object based on the
    following two parameters.
    

    :param base_dir: output root directory, typically 'outputs/'
    :type base_dir: str
    :param output_prefix: A prefix added to the final output files.
    :type str:
    '''

    def __init__(self, base_dir, output_prefix: Path) -> None:
        self.base_dir = base_dir
        self.output_prefix = output_prefix




[docs]class BLEval(object):
    '''
    The BEELINE Evaluation object is created by parsing a user-provided configuration
    file. Its methods provide for further processing its inputs into
    a series of jobs to be run, as well as running these jobs.
    '''

    def __init__(self,
            input_settings: InputSettings,
            output_settings: OutputSettings) -> None:

        self.input_settings = input_settings
        self.output_settings = output_settings


[docs]    def computeAUC(self, directed = True):

        '''
        Computes areas under the precision-recall (PR) and
        and ROC plots for each algorithm-dataset combination.
      
        Parameters
        ----------
        directedFlag: bool
            A flag to specifiy whether to treat predictions
            as directed edges (directed = True) or 
            undirected edges (directed = False).
        
        :returns:
            - AUPRC: A dataframe containing AUPRC values for each algorithm-dataset combination
            - AUROC: A dataframe containing AUROC values for each algorithm-dataset combination
        '''
        AUPRCDict = {}
        AUROCDict = {}

        for dataset in tqdm(self.input_settings.datasets, 
                            total = len(self.input_settings.datasets), unit = " Datasets"):
            
            AUPRC, AUROC = PRROC(dataset, self.input_settings, 
                                    directed = directed, selfEdges = False, plotFlag = False)
            AUPRCDict[dataset['name']] = AUPRC
            AUROCDict[dataset['name']] = AUROC
        AUPRC = pd.DataFrame(AUPRCDict)
        AUROC = pd.DataFrame(AUROCDict)
        return AUPRC, AUROC
    

[docs]    def parseTime(self):
        """
        Parse time output for each
        algorithm-dataset combination.
        
        :returns:
            A dictionary of times for all dataset-algorithm combinations
        """
        TimeDict = dict()

        for dataset in tqdm(self.input_settings.datasets, 
                            total = len(self.input_settings.datasets), unit = " Datasets"):
            timevals  = getTime(self, dataset)
            TimeDict[dataset["name"]] = timevals

        return TimeDict

[docs]    def computeJaccard(self):

        '''
        Computes Jaccard Index between top-k edge predictions
        of the same algorithm.
        
        :returns:
            A dataframe containing the median and median absolute
            deviation of the Jaccard Index values of each algorithm
            on the given set of datasets.
        '''
        JaccDF = {}
        JaccDF['Jaccard Median'] = {}
        JaccDF['Jaccard MAD'] = {}
        outDir = str(self.output_settings.base_dir) + \
                 str(self.input_settings.datadir).split("inputs")[1] + "/"
        for algo in tqdm(self.input_settings.algorithms, unit = " Algorithms"):
            if algo[1]['should_run'] == True:
                JaccDF['Jaccard Median'][algo[0]], JaccDF['Jaccard MAD'][algo[0]] = Jaccard(self, algo[0])
            
        return pd.DataFrame(JaccDF)     

    
[docs]    def computeSpearman(self):

        '''
        Finds the Spearman's correlation coefficient between
        the ranked edges of the same algorithm on the given
        set of datasets.
        
        :returns:
            A dataframe containing the median and median absolute
            deviation of the Separman's correlation values of each algorithm.
        '''
        corrDF = {}
        corrDF['Spearman Median'] = {}
        corrDF['Spearman MAD'] = {}
        outDir = str(self.output_settings.base_dir) + \
                 str(self.input_settings.datadir).split("inputs")[1] + "/"
        for algo in tqdm(self.input_settings.algorithms, unit = " Algorithms"):
            if algo[1]['should_run'] == True:
                corrDF['Spearman Median'][algo[0]],corrDF['Spearman MAD'][algo[0]] = Spearman(self, algo[0])
            
        return pd.DataFrame(corrDF)



[docs]    def computeNetMotifs(self):

        '''
        For each algorithm-dataset combination, this function computes the network motifs such as 
        Feedforward loops, Feedback loops and 
        Mutual interactions in the predicted top-k network. It returns the ratio of network motif counts
        compared to their respective values in the reference network.
      
        :returns:
            - FBL: A dataframe containing ratios of number of Feedback loops
            
            - FFL: A dataframe containing ratios of number of Feedforward loops
            
            - MI: A dataframe containing ratios of number of Mutual Interactions
        '''
        FFLDict = {}
        FBLDict = {}
        MIDict = {}
        for dataset in tqdm(self.input_settings.datasets, 
                            total = len(self.input_settings.datasets), unit = " Datasets"):
            FBLDict[dataset["name"]], FFLDict[dataset["name"]], MIDict[dataset["name"]] = Motifs(dataset, self.input_settings)
            
        FBL = pd.DataFrame(FBLDict)
        FFL = pd.DataFrame(FFLDict)
        MI = pd.DataFrame(MIDict)
        
        return FBL, FFL, MI
    
    
    # def computePaths(self):
    #     '''
    #     For each algorithm-dataset combination, this function computes path lengths
    #     through TP edges and FP edges, returns statistics on path lengths.

    #     :returns:
    #         - pathStats: A dataframe path lengths in predicted network
    #     '''
    #     for dataset in tqdm(self.input_settings.datasets, 
    #                         total = len(self.input_settings.datasets), unit = " Datasets"):
    #         pathAnalysis(dataset, self.input_settings)

                 
[docs]    def computeEarlyPrec(self):

        '''
        For each algorithm-dataset combination,
        this function computes the Early Precision values of the 
        network formed using the predicted top-k edges.
        
        :returns: 
            A dataframe containing the early precision values 
            for each algorithm-dataset combination.

        '''
        Eprec = {}
        outDir = str(self.output_settings.base_dir) + \
                 str(self.input_settings.datadir).split("inputs")[1] + "/"
        for algo in tqdm(self.input_settings.algorithms, unit = " Algorithms"):
            if algo[1]['should_run'] == True:
                Eprec[algo[0]] = EarlyPrec(self, algo[0])
        return pd.DataFrame(Eprec).T


    
[docs]    def computeSignedEPrec(self):

        '''
        For each algorithm-dataset combination,
        this function computes the Early Precision values separately 
        for the activation and inhibitory edges.
        
        :returns: 
            - A dataframe containing early precision for activation edges
            - A dataframe containing early precision for inhibitory edges
        '''
        sEPRDict = {}
        sEPRDict['EPrec Activation'] = {}
        sEPRDict['EPrec Inhibition'] = {}
        outDir = str(self.output_settings.base_dir) + \
                 str(self.input_settings.datadir).split("inputs")[1] + "/"
        for algo in tqdm(self.input_settings.algorithms, unit = " Algorithms"):
            if algo[1]['should_run'] == True:
                sEPrecDF = signedEPrec(self, algo[0])
                sEPRDict['EPrec Activation'][algo[0]] = sEPrecDF['+']
                sEPRDict['EPrec Inhibition'][algo[0]] = sEPrecDF['-']
        return(pd.DataFrame(sEPRDict['EPrec Activation']).T, pd.DataFrame(sEPRDict['EPrec Inhibition']).T)
    

[docs]class ConfigParser(object):
    '''
    The class define static methods for parsing and storing the contents 
    of the config file that sets a that sets a large number of parameters 
    used in the BLEval.
    '''
[docs]    @staticmethod
    def parse(config_file_handle) -> BLEval:
        '''
        A method for parsing the input .yaml file.
        
        :param config_file_handle: Name of the .yaml file to be parsed
        :type config_file_handle: str
        
        :returns: 
            An object of class :class:`BLEval.BLEval`.

        '''
        config_map = yaml.load(config_file_handle)
        return BLEval(
            ConfigParser.__parse_input_settings(
                config_map['input_settings']),
            ConfigParser.__parse_output_settings(
                config_map['output_settings']))
    
    @staticmethod
    def __parse_input_settings(input_settings_map) -> InputSettings:
        '''
        A method for parsing and initializing 
        InputSettings object.
        '''
        input_dir = input_settings_map['input_dir']
        dataset_dir = input_settings_map['dataset_dir']
        datasets = input_settings_map['datasets']

        return InputSettings(
                Path(input_dir, dataset_dir),
                datasets,
                ConfigParser.__parse_algorithms(
                input_settings_map['algorithms']))


    @staticmethod
    def __parse_algorithms(algorithms_list):
        '''
        A method for parsing the list of algorithms
        that are being evaluated, along with
        any parameters being passed.
        
        Note that these parameters may not be
        used in the current evaluation, but can 
        be used at a later point.
        '''
        
        # Initilalize the list of algorithms
        algorithms = []
        
        # Parse contents of algorithms_list
        for algorithm in algorithms_list:
                combos = [dict(zip(algorithm['params'], val))
                    for val in itertools.product(
                        *(algorithm['params'][param]
                            for param in algorithm['params']))]
                for combo in combos:
                    algorithms.append([algorithm['name'],combo])
            

        return algorithms

    @staticmethod
    def __parse_output_settings(output_settings_map) -> OutputSettings:
        '''
        A method for parsing and initializing 
        Output object.
        '''
        output_dir = Path(output_settings_map['output_dir'])
        output_prefix = Path(output_settings_map['output_prefix'])

        return OutputSettings(output_dir,
                             output_prefix)