# Source code for BLPlot.PlotAUROC

from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import auc, roc_curve

from BLPlot.plotter import (
    Plotter,
    get_algo_ids,
    iter_datasets_with_runs,
    load_dataset_metric,
    make_box_figure,
)


def _make_roc_curve_figure(
    run_path: Path,
    gt_path: Path,
    algos: List[str],
    dataset_label: str,
) -> 'plt.Figure | None':
    """
    Build a ROC curve figure for all algorithms in a single run.

    Each algorithm is drawn as a separate line. For every algorithm the
    file ``run_path / <algo> / rankedEdges.csv`` is read as a tab-separated
    table with columns Gene1, Gene2 and EdgeWeight. Rows where Gene1 equals
    Gene2 are dropped, and each remaining row is labelled positive when its
    (Gene1, Gene2) pair appears, in that order, in the ground truth.

    An algorithm is skipped when its rankedEdges.csv is missing, is a
    zero-byte file, contains no rows after removing self-edges, or yields
    only one class of labels (the ROC curve is undefined in that case).
    Returns None if no algorithm produced a valid curve.

    Parameters
    ----------
    run_path : Path
        Output directory for the run (contains per-algorithm subdirectories).
    gt_path : Path
        Path to the ground truth edge list CSV (columns Gene1, Gene2),
        read with the default comma separator.
    algos : list[str]
        Algorithm IDs to plot. Their order in this list does not determine
        the legend order: curves are ordered by AUROC, best first.
    dataset_label : str
        Dataset label used in the plot title (nickname if set, else dataset_id).

    Returns
    -------
    plt.Figure or None
        The created figure, or None if the ground truth file does not exist
        or no valid curves could be drawn.

    Raises
    ------
    TypeError
        If run_path or gt_path is not a Path.
    """
    if not isinstance(run_path, Path):
        raise TypeError(f"run_path must be Path, got {type(run_path)}")
    if not isinstance(gt_path, Path):
        raise TypeError(f"gt_path must be Path, got {type(gt_path)}")

    if not gt_path.exists():
        print(f"Warning: ground truth not found at {gt_path}, skipping.")
        return None

    gt_df = pd.read_csv(gt_path, header=0)
    # A set gives O(1) membership tests while labelling predicted edges.
    true_edges = set(zip(gt_df['Gene1'], gt_df['Gene2']))

    # Compute AUROC for each algorithm first, then sort descending so that
    # the best-performing algorithm appears first in the legend and is drawn
    # with the first palette colour.
    curves = []
    for algo in algos:
        edges_path = run_path / algo / 'rankedEdges.csv'
        if not edges_path.exists():
            continue

        try:
            df = pd.read_csv(edges_path, sep='\t', header=0)
        except pd.errors.EmptyDataError:
            # A zero-byte output file has no header to parse. Treat it like
            # a missing file so one unusable algorithm does not abort the
            # plot for every other algorithm.
            print(f"Warning: {edges_path} is empty, skipping {algo}.")
            continue

        # Self-edges are excluded from the evaluation.
        predicted = df[df['Gene1'] != df['Gene2']]
        if predicted.empty:
            continue

        labels = [
            1 if edge in true_edges else 0
            for edge in zip(predicted['Gene1'], predicted['Gene2'])
        ]
        scores = predicted['EdgeWeight'].values

        # ROC curve is undefined when only one class appears
        n_positive = sum(labels)
        if n_positive == 0 or n_positive == len(labels):
            continue

        fpr, tpr, _ = roc_curve(labels, scores)
        score = auc(fpr, tpr)
        curves.append((score, algo, fpr, tpr))

    if not curves:
        return None

    # Sort by AUROC descending
    curves.sort(key=lambda x: x[0], reverse=True)
    colors = sns.color_palette("Set1", n_colors=len(curves))

    fig, ax = plt.subplots(figsize=(7, 5))
    # Random classifier diagonal reference line
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--', linewidth=0.8, label='Random')

    for (score, algo, fpr, tpr), color in zip(curves, colors):
        ax.plot(fpr, tpr, label=f'{algo} (AUROC={score:.3f})', color=color)

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve — {dataset_label}')
    # The legend sits outside the axes so it never covers the curves.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
    # Lay out this figure explicitly instead of whatever pyplot currently
    # considers the "current" figure.
    fig.tight_layout()
    return fig


def _make_figure(
    dataset_label: str,
    dataset_path: Path,
    gt_path: Path,
    runs: list,
    algos: List[str],
) -> 'plt.Figure | None':
    """
    Build the AUROC graphic that matches the number of runs in a dataset.

    A dataset with exactly one run gets a ROC curve figure; any other
    number of runs gets a box plot built from the dataset's AUROC.csv,
    with the random-classifier baseline passed as 0.5.

    Parameters
    ----------
    dataset_label : str
        Dataset label used in the plot title (nickname if set, else dataset_id).
    dataset_path : Path
        Dataset output directory. It holds AUROC.csv (read for the box plot)
        and the per-run subdirectories (used for the ROC curve).
    gt_path : Path
        Ground truth CSV path; only used when a ROC curve is drawn.
    runs : list
        Run dicts from the config for this dataset. The 'run_id' entry of
        the first dict is read in the single-run case.
    algos : list[str]
        Algorithm IDs to include; only used when a ROC curve is drawn.

    Returns
    -------
    plt.Figure or None
        Whatever the selected figure builder returns.
    """
    if len(runs) != 1:
        # Not a single run: show the AUROC values loaded from AUROC.csv
        # as a box plot.
        auroc_values = load_dataset_metric(dataset_path, 'AUROC.csv')
        title = f'AUROC — {dataset_label}'
        return make_box_figure(auroc_values, title, 'AUROC', rand_value=0.5)

    # Exactly one run: draw the ROC curves from that run's directory.
    only_run = runs[0]
    single_run_dir = dataset_path / only_run['run_id']
    return _make_roc_curve_figure(single_run_dir, gt_path, algos, dataset_label)


class PlotAUROC(Plotter):
    """
    Plotter that produces one AUROC graphic per dataset.

    For datasets with a single run a ROC curve is drawn (one line per
    algorithm). For datasets with multiple runs a box plot is drawn (one box
    per algorithm, distribution across runs). Each dataset is written as both a
    PDF and PNG under an AUROC/ subdirectory of the output directory, named
    <dataset_label>-AUROC.pdf and <dataset_label>-AUROC.png. Any '/' in the
    dataset label is replaced by '-' in the file name.
    """

    def __call__(self, config: dict, output_dir: Path, root: Path) -> None:
        """
        Generate per-dataset AUROC graphics, writing each to its own PDF and PNG.

        Creates output_dir/AUROC/ and writes <dataset_label>-AUROC.pdf and
        <dataset_label>-AUROC.png for each enabled dataset. Datasets that
        produce no figure (missing data or no valid curves) are skipped.

        Parameters
        ----------
        config : dict
            Parsed YAML configuration.
        output_dir : Path
            Parent directory; an AUROC/ subdirectory is created inside it.
        root : Path
            Working directory from which config paths are resolved.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If config is not a dict, or output_dir or root is not a Path.
        """
        if not isinstance(config, dict):
            raise TypeError(f"config must be dict, got {type(config)}")
        if not isinstance(output_dir, Path):
            raise TypeError(f"output_dir must be Path, got {type(output_dir)}")
        if not isinstance(root, Path):
            raise TypeError(f"root must be Path, got {type(root)}")

        auroc_dir = output_dir / 'AUROC'
        auroc_dir.mkdir(parents=True, exist_ok=True)

        algos = get_algo_ids(config)
        for _, dlabel, dp, gtp, runs in iter_datasets_with_runs(config, root):
            fig = _make_figure(dlabel, dp, gtp, runs, algos)
            if fig is None:
                continue
            # '/' would be read as a directory separator in the file name.
            safe_label = dlabel.replace('/', '-')
            stem = auroc_dir / f'{safe_label}-AUROC'
            try:
                for ext in ('.pdf', '.png'):
                    # Append the extension instead of using with_suffix():
                    # with_suffix() replaces everything after the last dot,
                    # so a label such as "GSD.v2" would be saved as "GSD.pdf".
                    fig.savefig(stem.with_name(stem.name + ext))
            finally:
                # Release the figure even when saving fails.
                plt.close(fig)
            print(f"Saved {stem}.pdf and .png")