# nschense / alnn_rewrite
# pyright: basic

"""Regenerate analysis plots from existing computed data (CSV files).

This script regenerates all plots from previously computed analysis results
without re-running the full analysis pipeline. Useful when making changes
to plotting parameters or fixing visualizations.

Usage: Run from the project root (alnn_rewrite directory):
    python analysis/regenerate_plots.py /path/to/run_directory/backend_name

Example:
    python analysis/regenerate_plots.py analysis_output/run_20260428_120000/ensemble
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from analysis.analysis_modules import _uncertainty_cutoff_analysis
from analysis.defaults import (
    DEFAULT_CALIBRATION_BINS,
    DEFAULT_DECISION_THRESHOLD,
    uncertainty_cutoff_percentiles,
)
from analysis.plotting import (
    plots_dir,
    save_calibration_plot,
    save_performance_threshold_pair_plot,
    save_performance_threshold_plot,
    save_uncertainty_cutoff_pair_plot,
    save_uncertainty_cutoff_plot,
)
from analysis.runtime import write_json


def _plot_description(filename: str) -> str:
    """Return a one-line description for a generated plot image.

    Exact filenames are looked up first. Uncertainty-cutoff plots that are
    written once per uncertainty type embed a slug in their name
    (``performance_uncertainty_cutoff_<slug>_accuracy.png``); those fall back
    to the description of the matching slug-free family plot instead of the
    generic text.

    Args:
        filename: Base name of the image file, including the ``.png`` suffix.

    Returns:
        The matching description, or ``"Generated analysis plot."`` when the
        name is not recognised.
    """
    fallback = "Generated analysis plot."
    descriptions = {
        "performance_threshold_accuracy.png": "Accuracy as the decision threshold varies.",
        "performance_threshold_f1.png": "F1 score as the decision threshold varies.",
        "performance_threshold_accuracy_f1.png": "Accuracy and F1 shown side-by-side as the decision threshold varies.",
        "performance_uncertainty_cutoff_accuracy.png": "Accuracy while progressively restricting to higher-confidence and uncertainty-metric subsets.",
        "performance_uncertainty_cutoff_f1.png": "F1 score while progressively restricting to higher-confidence and uncertainty-metric subsets.",
        "performance_uncertainty_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across uncertainty-cutoff restriction levels.",
        "performance_uncertainty_percentile_cutoff_accuracy.png": "Accuracy from least to most restricted percentile-wise subset selection.",
        "performance_uncertainty_percentile_cutoff_f1.png": "F1 score from least to most restricted percentile-wise subset selection.",
        "performance_uncertainty_percentile_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across percentile-floor restriction levels.",
        "calibration_reliability.png": "Reliability diagram comparing predicted probability to empirical outcome frequency.",
        "performance_threshold_accuracy_coverage.png": "Sample distribution (correct vs incorrect) across decision thresholds.",
        "performance_threshold_f1_coverage.png": "Sample distribution (correct vs incorrect) across decision thresholds.",
        "performance_threshold_accuracy_f1_coverage.png": "Sample distribution (correct vs incorrect) across decision thresholds.",
        "performance_uncertainty_cutoff_accuracy_coverage.png": "Sample coverage breakdown across restriction levels.",
        "performance_uncertainty_cutoff_f1_coverage.png": "Sample coverage breakdown across restriction levels.",
        "performance_uncertainty_cutoff_accuracy_f1_coverage.png": "Sample coverage breakdown across restriction levels.",
        "performance_uncertainty_percentile_cutoff_accuracy_coverage.png": "Sample coverage breakdown as percentile floor increases.",
        "performance_uncertainty_percentile_cutoff_f1_coverage.png": "Sample coverage breakdown as percentile floor increases.",
        "performance_uncertainty_percentile_cutoff_accuracy_f1_coverage.png": "Sample coverage breakdown as percentile floor increases.",
    }

    exact = descriptions.get(filename)
    if exact is not None:
        return exact

    # Per-uncertainty-type plots are named "<family>_<slug><metric suffix>".
    # Strip the slug and reuse the family-level description.
    family_prefixes = (
        "performance_uncertainty_percentile_cutoff",
        "performance_uncertainty_cutoff",
    )
    # Ordered so that a suffix is always tried before any suffix it ends with
    # (e.g. "_accuracy_f1.png" before "_f1.png").
    metric_suffixes = (
        "_accuracy_f1_coverage.png",
        "_accuracy_coverage.png",
        "_f1_coverage.png",
        "_accuracy_f1.png",
        "_accuracy.png",
        "_f1.png",
    )
    for prefix in family_prefixes:
        if not filename.startswith(prefix + "_"):
            continue
        for suffix in metric_suffixes:
            if not filename.endswith(suffix):
                continue
            slug = filename[len(prefix) + 1 : len(filename) - len(suffix)]
            if slug:
                return descriptions.get(prefix + suffix, fallback)

    return fallback


def _write_backend_plot_report(backend: str, out_dir: Path) -> Path:
    """Write ``plots_report.md`` listing every PNG found under ``out_dir/plots``.

    Args:
        backend: Backend label; title-cased for the report heading.
        out_dir: Directory that holds the ``plots`` folder and receives the report.

    Returns:
        Path of the Markdown report that was written.
    """
    plot_root = out_dir / "plots"
    # Sorted so the report order is stable across runs.
    png_files = sorted(plot_root.rglob("*.png")) if plot_root.exists() else []

    body = [
        f"# {backend.title()} Analysis Plot Report (Regenerated)",
        "",
        "This document lists regenerated analysis plots with brief descriptions.",
        "",
    ]

    if png_files:
        for png in png_files:
            heading = png.stem.replace("_", " ").title()
            # Link relative to the report so the directory can be moved as a unit.
            link_target = png.relative_to(out_dir).as_posix()
            body.extend(
                [
                    f"## {heading}",
                    _plot_description(png.name),
                    "",
                    f"![{heading}]({link_target})",
                    "",
                ]
            )
    else:
        body.append("No plot images were found for this backend run.")

    destination = out_dir / "plots_report.md"
    destination.write_text("\n".join(body), encoding="utf-8")
    return destination


def regenerate_performance_plots(backend_dir: Path) -> dict[str, Any]:
    """Regenerate performance threshold plots from existing CSV.

    Reads ``performance_threshold_sweep.csv`` from ``backend_dir`` and redraws
    the accuracy, F1, and combined accuracy/F1 threshold plots.

    Args:
        backend_dir: Backend-specific output directory holding the sweep CSV.

    Returns:
        ``{"status": "skipped", "reason": ...}`` when the CSV is missing;
        otherwise a dict with ``"status": "regenerated"`` plus one entry per
        plot key giving the output path (as a string) handed to the plotter.
    """
    perf_csv = backend_dir / "performance_threshold_sweep.csv"
    if not perf_csv.exists():
        return {"status": "skipped", "reason": "no performance_threshold_sweep.csv"}

    df = pd.read_csv(perf_csv)

    # The directory name doubles as the backend label; a directory literally
    # named "plots" is labelled "ensemble".
    # NOTE(review): an earlier no-op branch here claimed to consult the parent
    # directory name but never did — confirm whether "<backend>/plots" inputs
    # should be labelled with the parent's name instead.
    backend = "ensemble" if backend_dir.name == "plots" else backend_dir.name

    accuracy_plot_path = plots_dir(backend_dir) / "performance_threshold_accuracy.png"
    f1_plot_path = plots_dir(backend_dir) / "performance_threshold_f1.png"
    pair_plot_path = plots_dir(backend_dir) / "performance_threshold_accuracy_f1.png"

    single_metric_plots = (
        ("accuracy", "Accuracy", accuracy_plot_path),
        ("f1", "F1", f1_plot_path),
    )
    for metric_column, metric_label, output_path in single_metric_plots:
        save_performance_threshold_plot(
            df=df,
            backend=backend,
            output_path=output_path,
            metric_column=metric_column,
            metric_label=metric_label,
            plot_key=f"performance_threshold_{metric_column}",
        )
    save_performance_threshold_pair_plot(
        df=df,
        backend=backend,
        output_path=pair_plot_path,
        plot_key="performance_threshold_accuracy_f1",
    )

    return {
        "status": "regenerated",
        "performance_threshold_accuracy": str(accuracy_plot_path),
        "performance_threshold_f1": str(f1_plot_path),
        "performance_threshold_accuracy_f1": str(pair_plot_path),
    }


def _regenerate_cutoff_family(
    backend_dir: Path,
    csv_path: Path,
    file_prefix: str,
    title_prefix: str,
    x_label: str,
) -> dict[str, str]:
    """Redraw accuracy, F1 and pair plots for each uncertainty type in one CSV.

    Args:
        backend_dir: Backend output directory; passed to ``plots_dir``.
        csv_path: Existing CSV with an ``uncertainty_type`` column.
        file_prefix: Family name used for output filenames and plot keys.
        title_prefix: Title prefix forwarded to the plotting helpers.
        x_label: X-axis label forwarded to the plotting helpers.

    Returns:
        Mapping of ``"<file_prefix>_<slug>_<metric>"`` to the output path string.
    """
    family_df = pd.read_csv(csv_path)
    written: dict[str, str] = {}

    # Rows without an uncertainty type cannot be slugged or grouped; skip them.
    type_names = pd.unique(family_df["uncertainty_type"].dropna())
    for uncertainty_name in sorted(type_names):
        sub_df = family_df[family_df["uncertainty_type"] == uncertainty_name].copy()
        # str() keeps non-string labels (e.g. numeric codes) from breaking .lower().
        slug = str(uncertainty_name).lower().replace(" ", "_")
        target_dir = plots_dir(backend_dir)

        for metric_column, metric_label in (("accuracy", "Accuracy"), ("f1", "F1")):
            output_path = target_dir / f"{file_prefix}_{slug}_{metric_column}.png"
            save_uncertainty_cutoff_plot(
                cutoff_df=sub_df,
                title_prefix=title_prefix,
                x_label=x_label,
                output_path=output_path,
                metric_column=metric_column,
                metric_label=metric_label,
                plot_key=f"{file_prefix}_{metric_column}",
            )
            written[f"{file_prefix}_{slug}_{metric_column}"] = str(output_path)

        pair_path = target_dir / f"{file_prefix}_{slug}_accuracy_f1.png"
        save_uncertainty_cutoff_pair_plot(
            cutoff_df=sub_df,
            title_prefix=title_prefix,
            x_label=x_label,
            output_path=pair_path,
            plot_key=f"{file_prefix}_accuracy_f1",
        )
        written[f"{file_prefix}_{slug}_accuracy_f1"] = str(pair_path)

    return written


def regenerate_uncertainty_cutoff_plots(backend_dir: Path) -> dict[str, Any]:
    """Regenerate uncertainty cutoff plots from existing CSV.

    Handles two optional inputs in ``backend_dir``:
    ``performance_uncertainty_cutoff.csv`` and
    ``performance_uncertainty_percentile_cutoff.csv``. For each file present,
    one accuracy, one F1 and one pair plot is drawn per uncertainty type.

    Args:
        backend_dir: Backend-specific output directory holding the CSV files.

    Returns:
        ``{"status": "skipped", "reason": ...}`` when neither CSV exists;
        otherwise a dict with ``"status": "regenerated"`` plus one entry per
        generated plot mapping its key to the output path string. The skip
        reason is not carried over into a successful result.
    """
    families = (
        (
            backend_dir / "performance_uncertainty_cutoff.csv",
            "performance_uncertainty_cutoff",
            "Model Output / Uncertainty Cutoff Percentile",
            "Restriction Level (0 = all samples, 100 = most restricted subset)",
        ),
        (
            backend_dir / "performance_uncertainty_percentile_cutoff.csv",
            "performance_uncertainty_percentile_cutoff",
            "Model Output / Uncertainty Percentile Floor",
            "Percentile Floor (0 = all samples, 100 = top percentile subset)",
        ),
    )

    plot_paths: dict[str, str] = {}
    found_any_csv = False
    for csv_path, file_prefix, title_prefix, x_label in families:
        if not csv_path.exists():
            continue
        found_any_csv = True
        plot_paths.update(
            _regenerate_cutoff_family(
                backend_dir, csv_path, file_prefix, title_prefix, x_label
            )
        )

    if not found_any_csv:
        return {"status": "skipped", "reason": "no cutoff CSV files found"}
    return {"status": "regenerated", **plot_paths}


def regenerate_calibration_plots(backend_dir: Path) -> dict[str, Any]:
    """Regenerate calibration plots from existing calibration data.

    Loads ``calibration_per_bin.npy`` from ``backend_dir`` and redraws the
    reliability diagram.

    Args:
        backend_dir: Backend-specific output directory holding the ``.npy`` file.

    Returns:
        ``{"status": "skipped", "reason": ...}`` when the data file is missing;
        otherwise ``{"status": "regenerated", "calibration_reliability": <path>}``
        where the path is the output location handed to the plotter.
    """
    calib_path = backend_dir / "calibration_per_bin.npy"
    if not calib_path.exists():
        return {"status": "skipped", "reason": "no calibration_per_bin.npy"}

    per_bin = np.load(calib_path)

    # The directory name doubles as the backend label; a directory literally
    # named "plots" is labelled "ensemble".
    # NOTE(review): an earlier no-op branch here claimed to consult the parent
    # directory name but never did — confirm whether "<backend>/plots" inputs
    # should be labelled with the parent's name instead.
    backend = "ensemble" if backend_dir.name == "plots" else backend_dir.name

    plot_path = plots_dir(backend_dir) / "calibration_reliability.png"
    save_calibration_plot(per_bin=per_bin, backend=backend, output_path=plot_path)

    return {
        "status": "regenerated",
        "calibration_reliability": str(plot_path),
    }


def main() -> None:
    """Command-line entry point: regenerate every plot for one backend directory.

    Parses the ``backend_dir`` argument, regenerates the performance,
    uncertainty-cutoff and calibration plots, rewrites the Markdown plot
    report, prints a status summary, and stores the results in
    ``plot_regeneration_log.json`` inside the backend directory.

    Exits with status 1 when the given path is missing or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Regenerate analysis plots from existing computed data CSV files."
    )
    parser.add_argument(
        "backend_dir",
        type=Path,
        help="Path to backend-specific analysis output directory "
        "(e.g., analysis_output/run_xxx/ensemble)",
    )

    args = parser.parse_args()
    backend_dir = args.backend_dir.resolve()

    if not backend_dir.exists():
        print(
            f"Error: Backend directory does not exist: {backend_dir}", file=sys.stderr
        )
        sys.exit(1)
    # A regular file passes exists() but cannot contain the CSV inputs or the
    # report, so reject it up front instead of failing later while writing.
    if not backend_dir.is_dir():
        print(
            f"Error: Backend path is not a directory: {backend_dir}", file=sys.stderr
        )
        sys.exit(1)

    print(f"Regenerating plots from: {backend_dir}")

    results: dict[str, Any] = {
        "backend_dir": str(backend_dir),
        "performance": regenerate_performance_plots(backend_dir),
        "uncertainty_cutoff": regenerate_uncertainty_cutoff_plots(backend_dir),
        "calibration": regenerate_calibration_plots(backend_dir),
    }

    # Write updated report
    report_path = _write_backend_plot_report(
        backend=backend_dir.name, out_dir=backend_dir
    )
    results["plots_report"] = str(report_path)

    print("\nPlot regeneration complete!")
    print("Results summary:")
    print(f"  Performance plots: {results['performance'].get('status', 'unknown')}")
    print(
        f"  Uncertainty cutoff plots: {results['uncertainty_cutoff'].get('status', 'unknown')}"
    )
    print(f"  Calibration plots: {results['calibration'].get('status', 'unknown')}")
    print(f"  Report written to: {report_path}")

    write_json(backend_dir / "plot_regeneration_log.json", results)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()