Source code for BLEval.computeEarlyPrec

import os
import argparse
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing
from pathlib import Path
import concurrent.futures
from collections import defaultdict
from itertools import product, permutations
from multiprocessing import Pool, cpu_count
from networkx.convert_matrix import from_pandas_adjacency

def EarlyPrec(evalObject, algorithmName, TFEdges = False):
    '''
    Computes early precision for a given algorithm for each dataset.
    We define early precision as the fraction of true positives in the
    top-k edges, where k is the number of edges in the ground truth
    network (excluding self loops).

    For every dataset the ground truth is read from
    ``<datadir>/<dataset name>/<dataset trueEdges>`` (comma separated) and
    the predictions from ``<base_dir><datadir suffix after "inputs">/
    <dataset name>/<algorithmName>/rankedEdges.csv`` (tab separated).
    Both files must provide ``Gene1`` and ``Gene2`` columns; the
    predictions must also provide ``EdgeWeight``.
    NOTE(review): the top-k cut-off is read positionally from the k-th
    row, so rankedEdges.csv is presumably sorted by decreasing
    ``EdgeWeight`` -- confirm with the code that writes it.

    :param evalObject: An object of class :class:`BLEval.BLEval`.
    :type evalObject: BLEval

    :param algorithmName: Name of the algorithm for which the early precision is computed.
    :type algorithmName: str

    :param TFEdges: If True, only edges going from a gene listed in the
        ``Gene1`` column of the ground truth (treated as a TF) to any
        other gene of the ground truth are considered, both for the
        ground truth and for the predictions.
    :type TFEdges: bool

    :returns:
        A dictionary mapping each dataset name to its early precision
        value for the given algorithm. The value is 0 when the output
        directory or ranked edge file is missing/unreadable, or when
        there are no usable predictions or ground truth edges.
    '''
    datasets = evalObject.input_settings.datasets

    # Selected (top-k) predicted edges per dataset, as "Gene1|Gene2" labels.
    rankDict = {}
    # Ground truth edges per dataset. Each dataset must be scored against
    # its own reference network, so the truth is stored per dataset name
    # instead of reusing whichever dataset happened to be read last.
    trueEdgesDict = {}

    for dataset in tqdm(datasets):
        name = dataset["name"]
        rankDict[name] = set()
        trueEdgesDict[name] = set()

        trueEdgesDF = pd.read_csv(str(evalObject.input_settings.datadir) + '/' +
                                  dataset['name'] + '/' +
                                  dataset['trueEdges'],
                                  sep = ',', header = 0, index_col = None)
        # Self loops and duplicated rows do not count as ground truth edges.
        trueEdgesDF = trueEdgesDF.loc[(trueEdgesDF['Gene1'] != trueEdgesDF['Gene2'])]
        trueEdgesDF = trueEdgesDF.drop_duplicates(keep = 'first').reset_index(drop = True)

        outDir = str(evalObject.output_settings.base_dir) + \
                 str(evalObject.input_settings.datadir).split("inputs")[1] + \
                 "/" + dataset["name"] + "/" + algorithmName
        rank_path = outDir + "/rankedEdges.csv"

        if not os.path.isdir(outDir):
            continue

        try:
            predDF = pd.read_csv(rank_path, sep = "\t", header = 0, index_col = None)
        except Exception:
            # Best effort: a missing, empty or malformed ranked edge file
            # yields a precision of 0 for this dataset instead of aborting
            # the whole evaluation. KeyboardInterrupt/SystemExit propagate.
            print("\nSkipping early precision computation for ", algorithmName, "on path", outDir)
            continue

        predDF = predDF.loc[(predDF['Gene1'] != predDF['Gene2'])]
        predDF = predDF.drop_duplicates(keep = 'first').reset_index(drop = True)

        trueEdges = trueEdgesDF['Gene1'] + "|" + trueEdgesDF['Gene2']
        if TFEdges:
            # Consider only edges going out of TFs: every (TF, gene) pair
            # over the genes of the ground truth, ignoring self edges.
            uniqueNodes = np.unique(trueEdgesDF.loc[:, ['Gene1', 'Gene2']])
            possibleEdges = {tf + '|' + gene
                             for tf in set(trueEdgesDF.Gene1)
                             for gene in uniqueNodes
                             if tf != gene}

            trueEdges = trueEdges[trueEdges.isin(possibleEdges)]
            print("\nEdges considered ", len(trueEdges))
            numEdges = len(trueEdges)

            # Limit the predicted edges to the genes that are in the
            # ground truth.
            predEdges = predDF['Gene1'] + "|" + predDF['Gene2']
            predDF = predDF[predEdges.isin(possibleEdges)]
        else:
            numEdges = len(set(trueEdges.values))
        trueEdgesDict[name] = set(trueEdges.values)

        # Check if the ranked edges list is empty; if so, the selected
        # edge set stays empty.
        if predDF.shape[0] == 0:
            print("\nSkipping early precision computation for on path ", rank_path, "due to lack of predictions.")
            continue

        if numEdges == 0:
            # With k == 0 there is no top-k row to read a threshold from
            # (index k-1 would wrap around to the last row) and no true
            # positive is possible.
            print("\nSkipping early precision computation on path ", rank_path, "due to lack of ground truth edges.")
            continue

        predDF = predDF.assign(EdgeWeight = predDF['EdgeWeight'].round(6).abs())

        # Use the number of true edges or the number of edges in the
        # dataframe, whichever is lower.
        maxk = min(predDF.shape[0], numEdges)
        edgeWeightTopk = predDF['EdgeWeight'].iloc[maxk - 1]

        # We do not want to include edges without any edge weight, so the
        # threshold is the larger of the non-zero minimum weight and the
        # weight of the top-kth edge. If every weight is zero the non-zero
        # minimum is NaN and no edge is selected.
        nonZeroMin = np.nanmin(predDF['EdgeWeight'].replace(0, np.nan).values)
        bestVal = max(nonZeroMin, edgeWeightTopk)

        newDF = predDF.loc[(predDF['EdgeWeight'] >= bestVal)]
        rankDict[name] = set(newDF['Gene1'] + "|" + newDF['Gene2'])

    Eprec = {}
    for dataset in tqdm(datasets):
        name = dataset["name"]
        selectedEdges = rankDict[name]
        if selectedEdges:
            truePositives = selectedEdges.intersection(trueEdgesDict[name])
            Eprec[name] = len(truePositives) / len(selectedEdges)
        else:
            Eprec[name] = 0

    return Eprec