Source code for BLEval.computeJaccard

import os
import argparse
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing
from pathlib import Path
import concurrent.futures
from itertools import permutations
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from networkx.convert_matrix import from_pandas_adjacency

[docs]def Jaccard(evalObject, algorithmName):
    """
    A function to compute median pairwirse Jaccard similarity index
    of predicted top-k edges for a given set of datasets (obtained from
    the same reference network). Here k is the number of edges in the
    reference network (excluding self loops). 
    
    
    :param evalObject: An object of class :class:`BLEval.BLEval`.
    :type evalObject: :obj:`BLEval`
      
      
    :param algorithmName: Name of the algorithm for which the Spearman correlation is computed.
    :type algorithmName: str
      
      
    :returns:
        - median: Median of Jaccard correlation values
        - mad: Median Absolute Deviation of  the Spearman correlation values
    """

    rankDict = {}
    sim_names = []
    for dataset in tqdm(evalObject.input_settings.datasets):
        trueEdgesDF = pd.read_csv(str(evalObject.input_settings.datadir)+'/'+ \
                      dataset['name'] + '/' +\
                      dataset['trueEdges'], sep = ',',
                      header = 0, index_col = None)

        possibleEdges = list(permutations(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]),
                                     r = 2))

        TrueEdgeDict = {'|'.join(p):0 for p in possibleEdges}
        PredEdgeDict = {'|'.join(p):0 for p in possibleEdges}

        # Compute TrueEdgeDict Dictionary
        # 1 if edge is present in the ground-truth
        # 0 if edge is not present in the ground-truth
        numEdges = 0
        for key in TrueEdgeDict.keys():
            if len(trueEdgesDF.loc[(trueEdgesDF['Gene1'] == key.split('|')[0]) &
                   (trueEdgesDF['Gene2'] == key.split('|')[1])])>0:
                    TrueEdgeDict[key] = 1
                    numEdges += 1

        outDir = str(evalObject.output_settings.base_dir) + \
                 str(evalObject.input_settings.datadir).split("inputs")[1] + \
                 "/" + dataset["name"] + "/" + algorithmName

        #algos = evalObject.input_settings.algorithms
        rank_path = outDir + "/rankedEdges.csv"
        if not os.path.isdir(outDir):
            continue
        try:
            predDF = pd.read_csv(rank_path, sep="\t", header=0, index_col=None)
        except:
            print("Skipping Jaccard computation for ", algorithmName, "on path", outDir)
            continue

        predDF = predDF.loc[(predDF['Gene1'] != predDF['Gene2'])]
        predDF.drop_duplicates(keep = 'first', inplace=True)
        predDF.reset_index(drop = True,  inplace= True)
        # check if ranked edges list is empty
        # if so, it is just set to an empty set

        if not predDF.shape[0] == 0:

            # we want to ensure that we do not include
            # edges without any edge weight
            # so check if the non-zero minimum is
            # greater than the edge weight of the top-kth
            # node, else use the non-zero minimum value.
            predDF.EdgeWeight = predDF.EdgeWeight.round(6)
            predDF.EdgeWeight = predDF.EdgeWeight.abs()

            # Use num True edges or the number of
            # edges in the dataframe, which ever is lower
            maxk = min(predDF.shape[0], numEdges)
            edgeWeightTopk = predDF.iloc[maxk-1].EdgeWeight

            nonZeroMin = np.nanmin(predDF.EdgeWeight.replace(0, np.nan).values)
            bestVal = max(nonZeroMin, edgeWeightTopk)

            newDF = predDF.loc[(predDF['EdgeWeight'] >= bestVal)]
            rankDict[dataset["name"]] = set(newDF['Gene1'] + "|" + newDF['Gene2'])
        else:
            rankDict[dataset["name"]] = set([])

    Jdf = computePairwiseJacc(rankDict)
    df = Jdf.where(np.triu(np.ones(Jdf.shape),  k = 1).astype(np.bool))
    df = df.stack().reset_index()
    df.columns = ['Row','Column','Value']
    return(df.Value.median(),df.Value.mad())


[docs]def computePairwiseJacc(inDict):
    """
    A helper function to compute all pairwise Jaccard similarity indices
    of predicted top-k edges for a given set of datasets (obtained from
    the same reference network). Here k is the number of edges in the
    reference network (excluding self loops). 
    
    

    :param inDict:  A dictionary contaninig top-k predicted edges  for each dataset. Here, keys are the dataset name and the values are the set of top-k edges.
    :type inDict: dict
    :returns:
        A dataframe containing pairwise Jaccard similarity index values
    """
    jaccDF = {key:{key1:{} for key1 in inDict.keys()} for key in inDict.keys()}
    for key_i in inDict.keys():
        for key_j in inDict.keys():
            num = len(inDict[key_i].intersection(inDict[key_j]))
            den = len(inDict[key_i].union(inDict[key_j]))
            if den != 0:
                jaccDF[key_i][key_j] = num/den
            else:
                jaccDF[key_i][key_j] = 0
    return pd.DataFrame(jaccDF)