Source code for BLPlot.plotter

import math
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def get_algo_ids(config: dict) -> List[str]:
    """
    Return the list of enabled algorithm IDs from the config.

    An algorithm is enabled when its 'should_run' entry is truthy. The flag
    may be a bare value or a list; for a list only the first element is
    consulted. A missing flag and an empty list both count as disabled.

    Parameters
    ----------
    config : dict
        Parsed YAML configuration. Must contain 'input_settings'; its
        'algorithms' entry may be absent or null (both mean "no algorithms").

    Returns
    -------
    list[str]
        Algorithm IDs whose should_run flag is truthy, in config order.

    Raises
    ------
    TypeError
        If config is not a dict.
    KeyError
        If 'input_settings' is missing, or an enabled algorithm entry has no
        'algorithm_id'.
    """
    if not isinstance(config, dict):
        raise TypeError(f"config must be dict, got {type(config)}")

    # `or []` also covers a key that is present but empty in the YAML file,
    # which the parser hands back as None rather than as an empty list.
    algorithms = config['input_settings'].get('algorithms') or []

    enabled = []
    for algo in algorithms:
        flag = algo.get('should_run', [False])
        if isinstance(flag, list):
            # Only the first element matters; an empty list is "not set"
            # instead of an IndexError.
            flag = flag[0] if flag else False
        if flag:
            enabled.append(algo['algorithm_id'])
    return enabled
def iter_datasets_with_runs(
    config: dict,
    root: Path,
) -> Iterator[Tuple[str, str, Path, Path, list]]:
    """
    Yield (dataset_id, dataset_label, dataset_path, gt_path, runs) for each
    enabled dataset.

    Extends iter_datasets by also yielding the list of run dicts, so callers
    can determine run count and resolve per-run paths via
    dataset_path / run['run_id'].

    The run list comes from one of two places:

    * when the dataset entry sets 'scan_run_subdirectories', every
      subdirectory of input_dir/dataset_id becomes a run (sorted by path);
    * otherwise the dataset's explicit 'runs' list is used; a missing or
      null 'runs' entry yields an empty list.

    A dataset is enabled unless its 'should_run' flag is falsy. The flag may
    be a bare value or a list (first element consulted); an empty list is
    treated as falsy. A missing flag means enabled.

    Because this is a generator, argument validation and config lookups run
    when iteration starts, not when the function is called.

    Parameters
    ----------
    config : dict
        Parsed YAML configuration.
    root : Path
        Working directory from which config paths are resolved.

    Yields
    ------
    tuple of (str, str, Path, Path, list)
        dataset_id, dataset_label (nickname if set else dataset_id),
        output dataset directory, ground truth file path,
        list of run dicts (each has at least 'run_id' when produced by the
        subdirectory scan).

    Raises
    ------
    TypeError
        If config is not a dict or root is not a Path.
    KeyError
        If a required config key ('input_settings', 'output_settings',
        'input_dir', 'output_dir', 'dataset_id') is missing.
    FileNotFoundError
        If scan_run_subdirectories is set but the dataset input directory
        does not exist.
    RuntimeError
        If scan_run_subdirectories is set but the directory has no
        subdirectories.
    """
    if not isinstance(config, dict):
        raise TypeError(f"config must be dict, got {type(config)}")
    if not isinstance(root, Path):
        raise TypeError(f"root must be Path, got {type(root)}")

    input_settings = config['input_settings']
    output_settings = config['output_settings']

    input_dir = (root / input_settings['input_dir']).resolve()
    output_dir = (root / output_settings['output_dir']).resolve()

    # experiment_id : str — optional; when set, an experiment_id segment is
    # inserted between output_dir and the dataset path.
    experiment_id = output_settings.get('experiment_id', '')
    if experiment_id:
        output_dir = output_dir / experiment_id

    # `or []` also covers a 'datasets' key that is present but null.
    for ds in input_settings.get('datasets') or []:
        should_run = ds.get('should_run', [True])
        if isinstance(should_run, list):
            # First element decides; an empty list is falsy rather than an
            # IndexError.
            should_run = should_run[0] if should_run else False
        if not should_run:
            continue

        dataset_id = ds['dataset_id']
        # dataset_label : str — nickname used for plot labels; defaults to
        # dataset_id when 'nickname' is absent from the dataset config entry.
        dataset_label = ds.get('nickname', dataset_id)
        gt_filename = ds.get('groundTruthNetwork', 'GroundTruthNetwork.csv')

        dataset_path = output_dir / dataset_id
        gt_path = input_dir / dataset_id / gt_filename

        # Resolve the run list: either scan input subdirectories or use the
        # explicit 'runs' list from the config.
        if ds.get('scan_run_subdirectories'):
            # ds_input_path : Path — input_dir/dataset_id/
            ds_input_path = input_dir / dataset_id
            if not ds_input_path.is_dir():
                raise FileNotFoundError(
                    f"scan_run_subdirectories is set for dataset '{dataset_id}' "
                    f"but input directory '{ds_input_path}' does not exist."
                )
            runs = [
                {'run_id': d.name}
                for d in sorted(ds_input_path.iterdir())
                if d.is_dir()
            ]
            if not runs:
                raise RuntimeError(
                    f"scan_run_subdirectories is set for dataset '{dataset_id}' "
                    f"but no subdirectories were found in '{ds_input_path}'."
                )
        else:
            # Normalise a null 'runs' entry so callers always receive a list.
            runs = ds.get('runs') or []

        yield dataset_id, dataset_label, dataset_path, gt_path, runs
def iter_datasets(
    config: dict,
    root: Path,
) -> Iterator[Tuple[str, str, Path, Path]]:
    """
    Yield (dataset_id, dataset_label, dataset_path, gt_path) for each enabled
    dataset in config.

    A dataset is enabled unless its 'should_run' flag is falsy. The flag may
    be a bare value or a list (first element consulted); an empty list is
    treated as falsy. A missing flag means enabled.

    Because this is a generator, argument validation and config lookups run
    when iteration starts, not when the function is called.

    Parameters
    ----------
    config : dict
        Parsed YAML configuration.
    root : Path
        Working directory from which config paths are resolved.

    Yields
    ------
    tuple of (str, str, Path, Path)
        dataset_id, dataset_label (nickname if set else dataset_id),
        output dataset directory, ground truth file path.

    Raises
    ------
    TypeError
        If config is not a dict or root is not a Path.
    KeyError
        If a required config key ('input_settings', 'output_settings',
        'input_dir', 'output_dir', 'dataset_id') is missing.
    """
    if not isinstance(config, dict):
        raise TypeError(f"config must be dict, got {type(config)}")
    if not isinstance(root, Path):
        raise TypeError(f"root must be Path, got {type(root)}")

    input_settings = config['input_settings']
    output_settings = config['output_settings']

    input_dir = (root / input_settings['input_dir']).resolve()
    output_dir = (root / output_settings['output_dir']).resolve()

    # experiment_id : str — optional; when set, an experiment_id segment is
    # inserted between output_dir and the dataset path.
    experiment_id = output_settings.get('experiment_id', '')
    if experiment_id:
        output_dir = output_dir / experiment_id

    # `or []` also covers a 'datasets' key that is present but null.
    for ds in input_settings.get('datasets') or []:
        should_run = ds.get('should_run', [True])
        if isinstance(should_run, list):
            # First element decides; an empty list is falsy rather than an
            # IndexError.
            should_run = should_run[0] if should_run else False
        if not should_run:
            continue

        dataset_id = ds['dataset_id']
        # dataset_label : str — nickname used for plot labels; defaults to
        # dataset_id when 'nickname' is absent from the dataset config entry.
        dataset_label = ds.get('nickname', dataset_id)
        gt_filename = ds.get('groundTruthNetwork', 'GroundTruthNetwork.csv')

        dataset_path = output_dir / dataset_id
        gt_path = input_dir / dataset_id / gt_filename

        yield dataset_id, dataset_label, dataset_path, gt_path
def random_classifier_baseline(gt_path: Path) -> float:
    """
    Compute the expected precision of a random predictor for a dataset.

    The value is k / (n * (n - 1)), where k is the number of ground truth
    rows that are not self-loops and n is the number of distinct genes
    appearing in those rows. It serves as the random baseline for both AUPRC
    and early precision, since the PR curve of a random predictor is flat at
    height k / n_possible.

    Parameters
    ----------
    gt_path : Path
        Path to the ground truth edge list CSV (columns Gene1, Gene2).

    Returns
    -------
    float
        Random predictor baseline, or nan when it is undefined (no
        non-self-loop edges, or fewer than two genes).

    Raises
    ------
    TypeError
        If gt_path is not a Path.
    """
    if not isinstance(gt_path, Path):
        raise TypeError(f"gt_path must be Path, got {type(gt_path)}")

    edges = pd.read_csv(gt_path, header=0)

    # Self-loops are dropped first, so they contribute neither to the edge
    # count nor to the gene set.
    not_self_loop = edges['Gene1'] != edges['Gene2']
    edges = edges[not_self_loop]

    edge_count = len(edges)
    gene_count = len(set(edges['Gene1']) | set(edges['Gene2']))

    # Number of ordered gene pairs without self-pairs.
    ordered_pairs = gene_count * (gene_count - 1)

    if edge_count == 0 or ordered_pairs == 0:
        return float('nan')
    return edge_count / ordered_pairs
def load_dataset_metric(
    dataset_path: Path,
    metric_csv: str,
) -> Dict[str, List[float]]:
    """
    Load per-algorithm metric values from one dataset's pre-computed CSV.

    The CSV is read with its first column as the index: each row is one
    algorithm and each remaining column one run. NaN cells are dropped from
    the per-algorithm lists. If the file does not exist, a warning is
    printed and an empty dict is returned.

    Parameters
    ----------
    dataset_path : Path
        Output directory for the dataset (contains the metric CSV).
    metric_csv : str
        Filename of the metric CSV (e.g. 'AUPRC.csv').

    Returns
    -------
    dict[str, list[float]]
        Algorithm name -> list of non-NaN run values for this dataset.

    Raises
    ------
    TypeError
        If dataset_path is not a Path or metric_csv is not a str.
    """
    if not isinstance(dataset_path, Path):
        raise TypeError(f"dataset_path must be Path, got {type(dataset_path)}")
    if not isinstance(metric_csv, str):
        raise TypeError(f"metric_csv must be str, got {type(metric_csv)}")

    csv_path = dataset_path / metric_csv
    if not csv_path.exists():
        print(f"Warning: {csv_path} not found, skipping.")
        return {}

    table = pd.read_csv(csv_path, index_col=0)

    per_algorithm = {}
    for algo in table.index:
        run_values = table.loc[algo].tolist()
        # Keep only the runs that actually produced a value.
        per_algorithm[str(algo)] = [
            value for value in run_values if not math.isnan(value)
        ]
    return per_algorithm
def make_box_figure(
    values: Dict[str, List[float]],
    title: str,
    ylabel: str,
    rand_value: float = None,
    ylim: 'Tuple[float, float] | None' = (0.0, 1.0),
) -> 'plt.Figure | None':
    """
    Build a box plot figure and return it without saving.

    One seaborn box is drawn per algorithm (algorithms sorted by name), with
    the individual observations overlaid as a swarm plot. If rand_value is
    given, a dashed grey horizontal line marks the random-predictor
    baseline. When values is empty, a message is printed and None is
    returned so callers can skip adding an empty page to a multi-page PDF.

    Note that this sets matplotlib's global 'font.size' rcParam to 14.

    Parameters
    ----------
    values : dict[str, list[float]]
        Algorithm name -> list of observed metric values.
    title : str
        Plot title.
    ylabel : str
        Y-axis label.
    rand_value : float or None
        If provided, a dashed grey horizontal line is drawn at this y value
        to indicate the random-predictor baseline.
    ylim : tuple of (float, float) or None
        Y-axis limits. Pass None to let matplotlib auto-scale (required for
        metrics like EPR whose values can exceed 1.0). Defaults to
        (0.0, 1.0), which is appropriate for AUPRC and AUROC.

    Returns
    -------
    plt.Figure or None
        The created figure, or None if values is empty.

    Raises
    ------
    TypeError
        If values is not a dict.
    """
    if not isinstance(values, dict):
        raise TypeError(f"values must be dict, got {type(values)}")
    if not values:
        print(f"No data to plot for '{title}'.")
        return None

    plt.rcParams.update({'font.size': 14})

    algo_order = sorted(values.keys())

    # seaborn wants long-form data: one (algorithm, value) row per observation.
    rows = []
    for name in algo_order:
        for observation in values[name]:
            rows.append((name, observation))
    frame = pd.DataFrame(rows, columns=['Algorithm', 'Value'])

    # Widen the figure with the number of algorithms, but never below 6 inches.
    width = max(6, len(algo_order) * 0.9)
    fig, ax = plt.subplots(figsize=(width, 5))

    # Dashed grey line marking the random-predictor baseline.
    if rand_value is not None:
        ax.axhline(rand_value, color='gray', linestyle='--', linewidth=0.8)

    # Arguments common to both seaborn layers.
    shared = {
        'data': frame,
        'x': 'Algorithm',
        'y': 'Value',
        'order': algo_order,
        'ax': ax,
    }
    box_colors = sns.color_palette("Set1", n_colors=len(algo_order))
    # fliersize=0 hides the box plot's own outlier markers; the swarm layer
    # already shows every point.
    sns.boxplot(palette=box_colors, fliersize=0, **shared)
    sns.swarmplot(alpha=0.5, color='k', **shared)

    if ylim is not None:
        ax.set_ylim(ylim)

    ax.set_title(title)
    ax.set_ylabel(ylabel, fontsize=18)
    ax.set_xlabel('Algorithm', fontsize=18)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()

    return fig
class Plotter(ABC):
    """
    Abstract base class for BEELINE plot generators.

    A concrete plotter is a callable object: subclasses implement __call__,
    which reads pre-computed evaluation CSVs and writes one or more plot
    files into the output directory chosen by the caller.

    The shared loading and rendering helpers live at module level in this
    module rather than on the class: iter_datasets, iter_datasets_with_runs,
    load_dataset_metric, make_box_figure, random_classifier_baseline.
    """

    @abstractmethod
    def __call__(self, config: dict, output_dir: Path, root: Path) -> None:
        """
        Generate plots from pre-computed evaluation CSVs.

        Must be overridden; the class cannot be instantiated otherwise.

        Parameters
        ----------
        config : dict
            Parsed YAML configuration.
        output_dir : Path
            Directory where plot files are written.
        root : Path
            Working directory from which config paths are resolved.

        Returns
        -------
        None
        """
        ...