# Source code for BLEval.computeSignedEPrec

import os
import yaml
import argparse
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing
from pathlib import Path
import concurrent.futures
from itertools import permutations
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from networkx.convert_matrix import from_pandas_adjacency

def _referenceEdgeSigns(trueEdgesDF):
    '''
    Index a reference network by gene pair.

    :param trueEdgesDF: Reference network with columns ``Gene1``, ``Gene2`` and ``Type``.
    :type trueEdgesDF: :obj:`pandas.DataFrame`

    :returns:
        A tuple ``(genes, firstSign)``: the set of gene names appearing in
        either column, and a dictionary mapping ``'Gene1|Gene2'`` to the
        ``Type`` of the first row listing that edge (self loops excluded).
    '''
    gene1 = trueEdgesDF['Gene1'].astype(str)
    gene2 = trueEdgesDF['Gene2'].astype(str)
    genes = set(gene1) | set(gene2)

    firstSign = {}
    for src, dst, edgeType in zip(gene1, gene2, trueEdgesDF['Type']):
        if src != dst:
            # setdefault keeps the first occurrence when an edge is listed twice.
            firstSign.setdefault(src + '|' + dst, edgeType)
    return genes, firstSign


def _loadRankedEdges(outDir, algorithmName):
    '''
    Read ``rankedEdges.csv`` from ``outDir``, dropping self loops and duplicate rows.

    :param outDir: Directory holding the algorithm's output for one dataset.
    :type outDir: str

    :param algorithmName: Algorithm name, used only in the skip message.
    :type algorithmName: str

    :returns:
        The ranked-edge dataframe, or ``None`` (after printing a message) when
        the directory is missing or the file cannot be read/parsed.
    '''
    if not os.path.isdir(outDir):
        print(outDir," not found")
        return None

    rank_path = outDir + "/rankedEdges.csv"
    try:
        predDF = pd.read_csv(rank_path, sep="\t", header=0, index_col=None)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; pandas' EmptyDataError and
        # ParserError are ValueError subclasses.
        print("Skipping signed precision computation for ", algorithmName, "on path", outDir)
        return None

    predDF = predDF.loc[predDF['Gene1'] != predDF['Gene2']]
    return predDF.drop_duplicates(keep='first').reset_index(drop=True)


def _topSignedEdges(predDF, genes, firstSign, sgn, numEdges, rank_path):
    '''
    Select the top-k predicted edges for one sign.

    Predictions are first restricted to ordered pairs of reference genes whose
    reference edge (if any) does not carry the other sign; k is ``numEdges``.

    :returns:
        A set of ``'Gene1|Gene2'`` strings; empty when nothing can be selected.
    '''
    src = predDF['Gene1'].astype(str)
    dst = predDF['Gene2'].astype(str)
    # Remove incorrect sign from consideration: a pair survives when both genes
    # are in the reference and the pair is either a non-edge or an edge of `sgn`.
    keep = np.array([a in genes and b in genes
                     and firstSign.get(a + '|' + b, sgn) == sgn
                     for a, b in zip(src, dst)], dtype=bool)
    candidates = predDF.loc[keep].reset_index(drop=True)

    if candidates.shape[0] == 0:
        print("\nSkipping signed early precision computation for file on path ", rank_path,"due to lack of predictions.")
        return set()
    if numEdges == 0:
        # No reference edge of this sign: precision is 0 by definition, and
        # k = 0 must not be turned into "index -1" (the last row).
        return set()

    weights = candidates['EdgeWeight'].round(6).abs()

    # Use num True edges or the number of edges in the dataframe,
    # whichever is lower.
    # NOTE(review): taking the k-th row as the cut-off assumes the file is
    # already ordered by decreasing |EdgeWeight| -- confirm against the writers
    # of rankedEdges.csv.
    maxk = min(candidates.shape[0], numEdges)
    edgeWeightTopk = weights.iloc[maxk - 1]

    # Edges without any weight must never be selected, so the cut-off is at
    # least the smallest non-zero weight.
    positive = weights[weights > 0]
    if positive.empty:
        return set()
    bestVal = max(positive.min(), edgeWeightTopk)

    chosen = candidates.loc[weights >= bestVal]
    return set(chosen['Gene1'].astype(str) + "|" + chosen['Gene2'].astype(str))


def signedEPrec(evalObject, algorithmName):
    '''
    Computes signed early precision for a given algorithm on every dataset,
    i.e., the function computes early precision of activation edges and
    early precision for inhibitory edges in the reference network.
    We define early precision of activation edges as the fraction of true
    positives in the top-ka edges, where ka is the number of activation
    edges in the reference network (excluding self loops).
    We define early precision of inhibitory edges as the fraction of true
    positives in the top-ki edges, where ki is the number of inhibitory
    edges in the reference network (excluding self loops).

    Each dataset is scored against its own reference network. A dataset with
    no usable predictions, or with no reference edge of a given sign, gets 0
    for that sign.

    :param evalObject: An object of class :class:`BLEval.BLEval`.
    :type evalObject: BLEval

    :param algorithmName: Name of the algorithm for which the early precision is computed.
    :type algorithmName: str


    :returns:
        A dataframe with early precision of activation edges (+) and inhibitory edges (-)
        for a given algorithm; one row per dataset, columns ``'+'`` and ``'-'``.
    '''
    signs = ('+', '-')
    Pprec = {sgn: {} for sgn in signs}
    datadir = str(evalObject.input_settings.datadir)

    for dataset in tqdm(evalObject.input_settings.datasets):
        name = dataset['name']
        trueEdgesDF = pd.read_csv(datadir + '/' + name + '/' + dataset['trueEdges'],
                                  sep=',', header=0, index_col=None)
        genes, firstSign = _referenceEdgeSigns(trueEdgesDF)

        # Outputs mirror the input tree below the "inputs" path component.
        outDir = str(evalObject.output_settings.base_dir) + \
                 datadir.split("inputs")[1] + \
                 "/" + name + "/" + algorithmName
        rank_path = outDir + "/rankedEdges.csv"
        predDF = _loadRankedEdges(outDir, algorithmName)

        for sgn in signs:
            trueEdges = {edge for edge, edgeType in firstSign.items()
                         if edgeType == sgn}
            if predDF is None:
                ranked = set()
            else:
                ranked = _topSignedEdges(predDF, genes, firstSign, sgn,
                                         len(trueEdges), rank_path)

            if ranked and trueEdges:
                Pprec[sgn][name] = len(ranked & trueEdges) / len(ranked)
            else:
                Pprec[sgn][name] = 0

    # To return just the median values, use
    # pd.DataFrame(Pprec).median(axis='index') instead.
    return pd.DataFrame(Pprec)