Sfoglia il codice sorgente

more analysis work - fixed graphs, going to implement more noise + stdev analysis

Nicholas Schense 6 giorni fa
parent
commit
e79e7f50c2

+ 4 - 11
analysis/data_access.py

@@ -51,14 +51,11 @@ def _positive_probability(
     if "model" in predictions.dims:
         class_probs = predictions.isel(img_class=class_index)
         prob_mean = class_probs.mean(dim="model").values
-        # Confidence is defined as distance from 0.5 and averaged across models.
-        class_prob_arr = np.asarray(class_probs.values, dtype=float)
-        model_axis = class_probs.dims.index("model")
-        conf_mean = np.abs(class_prob_arr - 0.5).mean(axis=model_axis)
+        # Confidence is the direct model output probability for the predicted class.
         prob_std = class_probs.std(dim="model").values
         return (
             np.asarray(prob_mean, dtype=float),
-            np.asarray(conf_mean, dtype=float),
+            np.asarray(prob_mean, dtype=float),
             np.asarray(prob_std, dtype=float),
             "std",
         )
@@ -68,9 +65,6 @@ def _positive_probability(
         dim = str(sample_like[0])
         class_probs = predictions.isel(img_class=class_index)
         prob_mean = class_probs.mean(dim=dim).values
-        class_prob_arr = np.asarray(class_probs.values, dtype=float)
-        sample_axis = class_probs.dims.index(dim)
-        conf_mean = np.abs(class_prob_arr - 0.5).mean(axis=sample_axis)
 
         # For Bayesian MC predictions, uncertainty should come from predictive
         # entropy of the predictive distribution rather than classwise std.
@@ -78,16 +72,15 @@ def _positive_probability(
         entropy_uncertainty = predictive_entropy(np.asarray(mc_preds, dtype=float))
         return (
             np.asarray(prob_mean, dtype=float),
-            np.asarray(conf_mean, dtype=float),
+            np.asarray(prob_mean, dtype=float),
             np.asarray(entropy_uncertainty, dtype=float),
             "predictive_entropy",
         )
 
     prob = predictions.isel(img_class=class_index).values
-    conf = np.abs(np.asarray(prob, dtype=float) - 0.5)
     return (
         np.asarray(prob, dtype=float),
-        np.asarray(conf, dtype=float),
+        np.asarray(prob, dtype=float),
         np.full_like(np.asarray(prob, dtype=float), np.nan),
         "unknown",
     )

+ 83 - 54
analysis/evaluate_models.py

@@ -7,6 +7,7 @@ from pathlib import Path
 from typing import Any
 
 import pandas as pd
+from tqdm.auto import tqdm
 
 from analysis.analysis_modules import (
     run_calibration,
@@ -229,65 +230,91 @@ def _run_backend(
         "uncertainty_metric": evaluation.uncertainty_metric,
     }
 
-    summary["performance"] = run_performance(
-        evaluation=evaluation,
-        output_dir=out_dir,
-        thresholds=thresholds,
-    )
-    summary["calibration"] = run_calibration(
-        evaluation=evaluation,
-        output_dir=out_dir,
-        bins=DEFAULT_CALIBRATION_BINS,
-    )
-    summary["physician"] = run_physician(
-        evaluation=evaluation,
-        clinical_df=clinical_df,
-        output_dir=out_dir,
-    )
-    summary["longitudinal"] = run_longitudinal(
-        evaluation=evaluation,
-        clinical_df=clinical_df,
-        output_dir=out_dir,
+    n_stages = 4 + (0 if skip_noise else 2)
+    stage_bar = tqdm(
+        total=n_stages,
+        desc=f"[{backend}] analysis stages",
+        unit="stage",
+        leave=False,
     )
+    try:
+        stage_bar.set_postfix_str("performance")
+        summary["performance"] = run_performance(
+            evaluation=evaluation,
+            output_dir=out_dir,
+            thresholds=thresholds,
+        )
+        stage_bar.update(1)
 
-    if skip_noise:
-        summary["noise"] = {"skipped": True, "reason": "--skip-noise supplied"}
-        summary["noise_accuracy_uncertainty"] = {
-            "skipped": True,
-            "reason": "Noise analysis skipped, so no noise table available.",
-        }
-    else:
-        try:
-            summary["noise"] = run_noise_analysis(
-                config=config,
-                root_dir=root_dir,
-                backend=backend,
-                output_dir=out_dir,
-                class_index=DEFAULT_POSITIVE_CLASS_INDEX,
-                noise_sigmas=noise_factors,
-                threshold=DEFAULT_DECISION_THRESHOLD,
-                calibration_bins=DEFAULT_CALIBRATION_BINS,
-                bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES,
-            )
+        stage_bar.set_postfix_str("calibration")
+        summary["calibration"] = run_calibration(
+            evaluation=evaluation,
+            output_dir=out_dir,
+            bins=DEFAULT_CALIBRATION_BINS,
+        )
+        stage_bar.update(1)
 
-            noise_table_path = Path(str(summary["noise"]["table"]))
-            noise_df = pd.read_csv(noise_table_path)
-            summary["noise_accuracy_uncertainty"] = (
-                run_noise_accuracy_uncertainty_analysis(
-                    noise_df=noise_df,
-                    backend=backend,
-                    output_dir=out_dir,
-                )
-            )
-        except Exception as exc:
-            summary["noise"] = {
-                "skipped": True,
-                "reason": f"Noise analysis failed: {exc}",
-            }
+        stage_bar.set_postfix_str("physician")
+        summary["physician"] = run_physician(
+            evaluation=evaluation,
+            clinical_df=clinical_df,
+            output_dir=out_dir,
+        )
+        stage_bar.update(1)
+
+        stage_bar.set_postfix_str("longitudinal")
+        summary["longitudinal"] = run_longitudinal(
+            evaluation=evaluation,
+            clinical_df=clinical_df,
+            output_dir=out_dir,
+        )
+        stage_bar.update(1)
+
+        if skip_noise:
+            summary["noise"] = {"skipped": True, "reason": "--skip-noise supplied"}
             summary["noise_accuracy_uncertainty"] = {
                 "skipped": True,
-                "reason": f"Noise relationship analysis failed: {exc}",
+                "reason": "Noise analysis skipped, so no noise table available.",
             }
+        else:
+            try:
+                stage_bar.set_postfix_str("noise")
+                summary["noise"] = run_noise_analysis(
+                    config=config,
+                    root_dir=root_dir,
+                    backend=backend,
+                    output_dir=out_dir,
+                    class_index=DEFAULT_POSITIVE_CLASS_INDEX,
+                    noise_sigmas=noise_factors,
+                    threshold=DEFAULT_DECISION_THRESHOLD,
+                    calibration_bins=DEFAULT_CALIBRATION_BINS,
+                    bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES,
+                )
+                stage_bar.update(1)
+
+                stage_bar.set_postfix_str("noise-correlation")
+                noise_table_path = Path(str(summary["noise"]["table"]))
+                noise_df = pd.read_csv(noise_table_path)
+                summary["noise_accuracy_uncertainty"] = (
+                    run_noise_accuracy_uncertainty_analysis(
+                        noise_df=noise_df,
+                        backend=backend,
+                        output_dir=out_dir,
+                    )
+                )
+                stage_bar.update(1)
+            except Exception as exc:
+                summary["noise"] = {
+                    "skipped": True,
+                    "reason": f"Noise analysis failed: {exc}",
+                }
+                summary["noise_accuracy_uncertainty"] = {
+                    "skipped": True,
+                    "reason": f"Noise relationship analysis failed: {exc}",
+                }
+                stage_bar.update(2)
+    finally:
+        stage_bar.close()
 
     report_path = _write_backend_plot_report(backend=backend, out_dir=out_dir)
     summary["plots_report"] = str(report_path)
@@ -341,8 +368,10 @@ def main() -> None:
         print(f"Dataset summary complete. Results saved to {paths.run_dir}")
         return
 
-    for backend in args.backend:
+    backend_iter = tqdm(args.backend, desc="Backends", unit="backend")
+    for backend in backend_iter:
         out_dir = backend_dir(paths, backend)
+        backend_iter.set_postfix_str(backend)
         if args.longitudinal_breakdown_only:
             manifest["backends"][backend] = _run_longitudinal_breakdown_only(
                 config=config,

+ 23 - 4
analysis/holdout_evaluation.py

@@ -9,6 +9,7 @@ from typing import Any
 import numpy as np
 import torch
 from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
 import xarray as xr
 
 from model.cnn import CNN3D
@@ -66,7 +67,9 @@ def _evaluate_ensemble(
     labels = np.zeros((n_samples, n_classes), dtype=np.float32)
     image_ids = np.zeros((n_samples,), dtype=int)
 
-    for model_i, model_file in enumerate(model_files):
+    model_iter = tqdm(model_files, desc="Ensemble checkpoints", unit="model")
+    for model_i, model_file in enumerate(model_iter):
+        model_iter.set_postfix_str(model_file.name)
         model = _init_cnn(config)
         model.load_state_dict(
             torch.load(model_file, map_location=device),
@@ -76,7 +79,14 @@ def _evaluate_ensemble(
         model.eval()
 
         with torch.no_grad():
-            for sample_i, (mri, xls, label, img_id) in enumerate(holdout_loader):
+            sample_iter = tqdm(
+                holdout_loader,
+                total=n_samples,
+                desc=f"{model_file.name}",
+                unit="batch",
+                leave=False,
+            )
+            for sample_i, (mri, xls, label, img_id) in enumerate(sample_iter):
                 mri_device = mri.float().to(device)
                 xls_device = xls.float().to(device)
                 output = model((mri_device, xls_device))
@@ -156,8 +166,17 @@ def _evaluate_bayesian(
     image_ids = np.zeros((n_samples,), dtype=int)
 
     with torch.no_grad():
-        for pass_i in range(mc_passes):
-            for sample_i, (mri, xls, label, img_id) in enumerate(holdout_loader):
+        pass_iter = tqdm(range(mc_passes), desc="Bayesian MC passes", unit="pass")
+        for pass_i in pass_iter:
+            pass_iter.set_postfix_str(f"pass={pass_i + 1}/{mc_passes}")
+            sample_iter = tqdm(
+                holdout_loader,
+                total=n_samples,
+                desc=f"MC pass {pass_i + 1}",
+                unit="batch",
+                leave=False,
+            )
+            for sample_i, (mri, xls, label, img_id) in enumerate(sample_iter):
                 mri_device = mri.float().to(device)
                 xls_device = xls.float().to(device)
                 output = model((mri_device, xls_device))

+ 25 - 6
analysis/noise_analysis.py

@@ -9,6 +9,7 @@ import numpy as np
 import pandas as pd
 import torch
 from bayesian_torch.utils.util import predictive_entropy
+from tqdm.auto import tqdm
 
 from model.cnn import CNN3D
 
@@ -132,7 +133,14 @@ def _infer_with_noise_ensemble(
     all_true: list[int] = []
 
     with torch.no_grad():
-        for mri, xls, labels, _ in test_loader:
+        batch_iter = tqdm(
+            test_loader,
+            total=len(test_loader),
+            desc=f"ensemble sigma={sigma:g}",
+            unit="batch",
+            leave=False,
+        )
+        for mri, xls, labels, _ in batch_iter:
             mri_device = mri.float().to(device)
             xls_device = xls.float().to(device)
             labels_device = labels.to(device)
@@ -144,7 +152,7 @@ def _infer_with_noise_ensemble(
 
             pred_mat = np.stack(preds, axis=0)
             mean = pred_mat.mean(axis=0)
-            confidence = np.abs(pred_mat - 0.5).mean(axis=0)
+            confidence = mean
             std = pred_mat.std(axis=0)
             true = labels_device[:, class_index].detach().cpu().numpy().astype(int)
 
@@ -176,7 +184,14 @@ def _infer_with_noise_bayesian(
     all_true: list[int] = []
 
     with torch.no_grad():
-        for mri, xls, labels, _ in test_loader:
+        batch_iter = tqdm(
+            test_loader,
+            total=len(test_loader),
+            desc=f"bayesian sigma={sigma:g}",
+            unit="batch",
+            leave=False,
+        )
+        for mri, xls, labels, _ in batch_iter:
             mri_device = mri.float().to(device)
             xls_device = xls.float().to(device)
             labels_device = labels.to(device)
@@ -188,7 +203,7 @@ def _infer_with_noise_bayesian(
 
             draw_mat = np.stack(draws, axis=0)  # (mc_passes, batch, classes)
             mean = draw_mat.mean(axis=0)[:, class_index]
-            confidence = np.abs(draw_mat[:, :, class_index] - 0.5).mean(axis=0)
+            confidence = mean
             entropy_uncertainty = predictive_entropy(draw_mat)
             true = labels_device[:, class_index].detach().cpu().numpy().astype(int)
 
@@ -237,7 +252,9 @@ def run_noise_analysis(
     if backend == "ensemble":
         models = _load_ensemble_models(config)
         example_rows: list[tuple[float, torch.Tensor]] = []
-        for sigma in noise_sigmas:
+        sigma_iter = tqdm(noise_sigmas, desc="Noise sweep (ensemble)", unit="sigma")
+        for sigma in sigma_iter:
+            sigma_iter.set_postfix_str(f"sigma={sigma:g}")
             y_true, y_prob, y_confidence, y_std = _infer_with_noise_ensemble(
                 test_loader,
                 models,
@@ -286,7 +303,9 @@ def run_noise_analysis(
     elif backend == "bayesian":
         model = _load_bayesian_model(config)
         example_rows = []
-        for sigma in noise_sigmas:
+        sigma_iter = tqdm(noise_sigmas, desc="Noise sweep (bayesian)", unit="sigma")
+        for sigma in sigma_iter:
+            sigma_iter.set_postfix_str(f"sigma={sigma:g}")
             y_true, y_prob, y_confidence, y_std = _infer_with_noise_bayesian(
                 test_loader,
                 model,

+ 114 - 37
analysis/plotting.py

@@ -10,7 +10,6 @@ import pandas as pd
 import torch
 from matplotlib.axes import Axes
 
-
 # Easily editable plot text overrides by plot key.
 # Example:
 # "performance_threshold": {
@@ -124,6 +123,56 @@ def _plot_correct_incorrect_bars(
     bars_ax.grid(False)
 
 
+def save_coverage_bar_plot(
+    x_values: pd.Series | np.ndarray,
+    n_correct: pd.Series | np.ndarray,
+    n_incorrect: pd.Series | np.ndarray,
+    x_label: str,
+    title: str,
+    output_path: Path,
+) -> None:
+    """Save a standalone bar chart showing sample counts (correct vs incorrect).
+
+    Correct counts are drawn as upward bars and incorrect counts as downward
+    (negated) bars around a zero baseline. Nothing is written when any of the
+    three inputs is empty.
+
+    Args:
+        x_values: Bar positions along the x axis (coerced to float).
+        n_correct: Count of correct samples at each x position.
+        n_incorrect: Count of incorrect samples at each x position.
+        x_label: Label for the x axis.
+        title: Figure title.
+        output_path: Destination image path; parent directories are created.
+    """
+    x = np.asarray(x_values, dtype=float)
+    correct = np.asarray(n_correct, dtype=float)
+    incorrect = np.asarray(n_incorrect, dtype=float)
+    if x.size == 0 or correct.size == 0 or incorrect.size == 0:
+        return
+
+    # Bar width is 80% of the smallest *positive* spacing between x positions.
+    # Duplicate or NaN positions would otherwise yield a zero/NaN width and
+    # invisible bars, so they are ignored; the fixed default is used when no
+    # usable spacing exists.
+    gaps = np.diff(np.sort(x[np.isfinite(x)]))
+    gaps = gaps[gaps > 0]
+    width = float(gaps.min()) * 0.8 if gaps.size else 0.04
+
+    # Counts may arrive coerced with errors="coerce", so all-NaN inputs are
+    # possible; fall back to 1.0 so the y-limits are always finite.
+    finite_counts = np.concatenate(
+        [correct[np.isfinite(correct)], incorrect[np.isfinite(incorrect)]]
+    )
+    max_count = max(float(finite_counts.max()), 1.0) if finite_counts.size else 1.0
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.bar(
+        x,
+        correct,
+        width=width,
+        color="#2ca02c",
+        alpha=0.6,
+        label="correct",
+        align="center",
+    )
+    # Incorrect counts are negated so they hang below the zero baseline.
+    ax.bar(
+        x,
+        -incorrect,
+        width=width,
+        color="#d62728",
+        alpha=0.6,
+        label="incorrect",
+        align="center",
+    )
+    ax.axhline(0.0, color="gray", linewidth=0.8, alpha=0.4)
+    # Symmetric limits with 15% headroom keep both bar directions comparable.
+    ax.set_ylim(-1.15 * max_count, 1.15 * max_count)
+    ax.set_xlabel(x_label)
+    ax.set_ylabel("Sample Count")
+    ax.set_title(title)
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+    fig.tight_layout()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    fig.savefig(output_path)
+    plt.close(fig)
+
+
 def plots_dir(output_dir: Path) -> Path:
     plots = output_dir / "plots"
     plots.mkdir(parents=True, exist_ok=True)
@@ -153,7 +202,6 @@ def save_performance_threshold_plot(
     )
 
     fig, ax = plt.subplots(figsize=(10, 5))
-    _plot_correct_incorrect_bars(ax, df["threshold"], n_correct, n_incorrect)
     ax.plot(df["threshold"], df[metric_column], label=metric_label, marker="o")
     ax.set_xlabel(x_label)
     ax.set_ylabel(y_label)
@@ -165,6 +213,17 @@ def save_performance_threshold_plot(
     fig.savefig(output_path)
     plt.close(fig)
 
+    # Generate separate coverage bar plot
+    coverage_path = output_path.parent / f"{output_path.stem}_coverage.png"
+    save_coverage_bar_plot(
+        x_values=df["threshold"],
+        n_correct=n_correct,
+        n_incorrect=n_incorrect,
+        x_label=x_label,
+        title=f"Sample Distribution vs Decision Threshold ({backend})",
+        output_path=coverage_path,
+    )
+
 
 def save_performance_threshold_pair_plot(
     df: pd.DataFrame,
@@ -191,7 +250,6 @@ def save_performance_threshold_pair_plot(
         (axes[0], "accuracy", "Accuracy", "o"),
         (axes[1], "f1", "F1", "s"),
     ]:
-        _plot_correct_incorrect_bars(ax, df["threshold"], n_correct, n_incorrect)
         ax.plot(df["threshold"], df[metric_col], label=metric_label, marker=marker)
         ax.set_xlabel(x_label)
         ax.set_ylabel(metric_label)
@@ -205,6 +263,17 @@ def save_performance_threshold_pair_plot(
     fig.savefig(output_path)
     plt.close(fig)
 
+    # Generate separate coverage bar plot
+    coverage_path = output_path.parent / f"{output_path.stem}_coverage.png"
+    save_coverage_bar_plot(
+        x_values=df["threshold"],
+        n_correct=n_correct,
+        n_incorrect=n_incorrect,
+        x_label=x_label,
+        title=f"Sample Distribution vs Decision Threshold ({backend})",
+        output_path=coverage_path,
+    )
+
 
 def save_uncertainty_cutoff_plot(
     cutoff_df: pd.DataFrame,
@@ -223,23 +292,6 @@ def save_uncertainty_cutoff_plot(
     )
 
     fig, ax = plt.subplots(figsize=(10, 5))
-    first_group = (
-        cutoff_df.sort_values(["uncertainty_type", "restriction_level"])
-        .groupby("uncertainty_type", as_index=False)
-        .head(1)
-    )
-    if not first_group.empty:
-        # Draw count bars once; uncertainty lines are overlaid afterwards.
-        rep_name = str(first_group.iloc[0]["uncertainty_type"])
-        rep = cutoff_df[cutoff_df["uncertainty_type"] == rep_name].sort_values(
-            "restriction_level"
-        )
-        _plot_correct_incorrect_bars(
-            ax,
-            rep["restriction_level"],
-            pd.to_numeric(rep["n_correct"], errors="coerce"),
-            pd.to_numeric(rep["n_incorrect"], errors="coerce"),
-        )
 
     for uncertainty_name, group in cutoff_df.groupby("uncertainty_type"):
         g = group.sort_values("restriction_level")
@@ -260,6 +312,27 @@ def save_uncertainty_cutoff_plot(
     fig.savefig(output_path)
     plt.close(fig)
 
+    # Generate separate coverage bar plot
+    first_group = (
+        cutoff_df.sort_values(["uncertainty_type", "restriction_level"])
+        .groupby("uncertainty_type", as_index=False)
+        .head(1)
+    )
+    if not first_group.empty:
+        rep_name = str(first_group.iloc[0]["uncertainty_type"])
+        rep = cutoff_df[cutoff_df["uncertainty_type"] == rep_name].sort_values(
+            "restriction_level"
+        )
+        coverage_path = output_path.parent / f"{output_path.stem}_coverage.png"
+        save_coverage_bar_plot(
+            x_values=rep["restriction_level"],
+            n_correct=pd.to_numeric(rep["n_correct"], errors="coerce"),
+            n_incorrect=pd.to_numeric(rep["n_incorrect"], errors="coerce"),
+            x_label=x_label_final,
+            title=f"Sample Coverage vs {title_prefix}",
+            output_path=coverage_path,
+        )
+
 
 def save_uncertainty_cutoff_pair_plot(
     cutoff_df: pd.DataFrame,
@@ -276,23 +349,6 @@ def save_uncertainty_cutoff_pair_plot(
     )
 
     fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True)
-    first_group = (
-        cutoff_df.sort_values(["uncertainty_type", "restriction_level"])
-        .groupby("uncertainty_type", as_index=False)
-        .head(1)
-    )
-    if not first_group.empty:
-        rep_name = str(first_group.iloc[0]["uncertainty_type"])
-        rep = cutoff_df[cutoff_df["uncertainty_type"] == rep_name].sort_values(
-            "restriction_level"
-        )
-        for ax in axes:
-            _plot_correct_incorrect_bars(
-                ax,
-                rep["restriction_level"],
-                pd.to_numeric(rep["n_correct"], errors="coerce"),
-                pd.to_numeric(rep["n_incorrect"], errors="coerce"),
-            )
 
     for uncertainty_name, group in cutoff_df.groupby("uncertainty_type"):
         g = group.sort_values("restriction_level")
@@ -317,6 +373,27 @@ def save_uncertainty_cutoff_pair_plot(
     fig.savefig(output_path)
     plt.close(fig)
 
+    # Generate separate coverage bar plot
+    first_group = (
+        cutoff_df.sort_values(["uncertainty_type", "restriction_level"])
+        .groupby("uncertainty_type", as_index=False)
+        .head(1)
+    )
+    if not first_group.empty:
+        rep_name = str(first_group.iloc[0]["uncertainty_type"])
+        rep = cutoff_df[cutoff_df["uncertainty_type"] == rep_name].sort_values(
+            "restriction_level"
+        )
+        coverage_path = output_path.parent / f"{output_path.stem}_coverage.png"
+        save_coverage_bar_plot(
+            x_values=rep["restriction_level"],
+            n_correct=pd.to_numeric(rep["n_correct"], errors="coerce"),
+            n_incorrect=pd.to_numeric(rep["n_incorrect"], errors="coerce"),
+            x_label=x_label_final,
+            title=f"Sample Coverage vs {title_prefix}",
+            output_path=coverage_path,
+        )
+
 
 def save_calibration_plot(per_bin: np.ndarray, backend: str, output_path: Path) -> None:
     title, x_label, y_label = _resolve_plot_text(

+ 327 - 0
analysis/regenerate_plots.py

@@ -0,0 +1,327 @@
+# pyright: basic
+
+"""Regenerate analysis plots from existing computed data (CSV files).
+
+This script regenerates all plots from previously computed analysis results
+without re-running the full analysis pipeline. Useful when making changes
+to plotting parameters or fixing visualizations.
+
+Usage: Run from the project root (alnn_rewrite directory):
+    python analysis/regenerate_plots.py /path/to/run_directory/backend_name
+
+Example:
+    python analysis/regenerate_plots.py analysis_output/run_20260428_120000/ensemble
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from analysis.analysis_modules import _uncertainty_cutoff_analysis
+from analysis.defaults import (
+    DEFAULT_CALIBRATION_BINS,
+    DEFAULT_DECISION_THRESHOLD,
+    uncertainty_cutoff_percentiles,
+)
+from analysis.plotting import (
+    plots_dir,
+    save_calibration_plot,
+    save_performance_threshold_pair_plot,
+    save_performance_threshold_plot,
+    save_uncertainty_cutoff_pair_plot,
+    save_uncertainty_cutoff_plot,
+)
+from analysis.runtime import write_json
+
+
+def _plot_description(filename: str) -> str:
+    """Return a one-line description for a generated plot image.
+
+    Exact filename matches are looked up first. Per-uncertainty-type plots carry
+    a slug between the family prefix and the metric suffix (for example
+    ``performance_uncertainty_cutoff_<slug>_accuracy.png``); those fall back to
+    the description of the corresponding un-slugged filename. Anything else
+    gets a generic description.
+    """
+    descriptions = {
+        "performance_threshold_accuracy.png": "Accuracy as the decision threshold varies.",
+        "performance_threshold_f1.png": "F1 score as the decision threshold varies.",
+        "performance_threshold_accuracy_f1.png": "Accuracy and F1 shown side-by-side as the decision threshold varies.",
+        "performance_uncertainty_cutoff_accuracy.png": "Accuracy while progressively restricting to higher-confidence and uncertainty-metric subsets.",
+        "performance_uncertainty_cutoff_f1.png": "F1 score while progressively restricting to higher-confidence and uncertainty-metric subsets.",
+        "performance_uncertainty_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across uncertainty-cutoff restriction levels.",
+        "performance_uncertainty_percentile_cutoff_accuracy.png": "Accuracy from least to most restricted percentile-wise subset selection.",
+        "performance_uncertainty_percentile_cutoff_f1.png": "F1 score from least to most restricted percentile-wise subset selection.",
+        "performance_uncertainty_percentile_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across percentile-floor restriction levels.",
+        "calibration_reliability.png": "Reliability diagram comparing predicted probability to empirical outcome frequency.",
+        "performance_threshold_accuracy_coverage.png": "Sample distribution (correct vs incorrect) across decision thresholds.",
+        "performance_threshold_f1_coverage.png": "Sample distribution (correct vs incorrect) across decision thresholds.",
+        "performance_threshold_accuracy_f1_coverage.png": "Sample distribution (correct vs incorrect) across decision thresholds.",
+        "performance_uncertainty_cutoff_accuracy_coverage.png": "Sample coverage breakdown across restriction levels.",
+        "performance_uncertainty_cutoff_f1_coverage.png": "Sample coverage breakdown across restriction levels.",
+        "performance_uncertainty_cutoff_accuracy_f1_coverage.png": "Sample coverage breakdown across restriction levels.",
+        "performance_uncertainty_percentile_cutoff_accuracy_coverage.png": "Sample coverage breakdown as percentile floor increases.",
+        "performance_uncertainty_percentile_cutoff_f1_coverage.png": "Sample coverage breakdown as percentile floor increases.",
+        "performance_uncertainty_percentile_cutoff_accuracy_f1_coverage.png": "Sample coverage breakdown as percentile floor increases.",
+    }
+    default = "Generated analysis plot."
+    exact = descriptions.get(filename)
+    if exact is not None:
+        return exact
+
+    # Slugged per-uncertainty-type files: strip the slug and reuse the family
+    # description. Suffixes are ordered longest first so "_accuracy_f1.png" is
+    # never mistaken for "_f1.png".
+    prefixes = (
+        "performance_uncertainty_percentile_cutoff",
+        "performance_uncertainty_cutoff",
+    )
+    suffixes = (
+        "_accuracy_f1_coverage.png",
+        "_accuracy_coverage.png",
+        "_f1_coverage.png",
+        "_accuracy_f1.png",
+        "_accuracy.png",
+        "_f1.png",
+    )
+    for prefix in prefixes:
+        if not filename.startswith(prefix + "_"):
+            continue
+        for suffix in suffixes:
+            # Require room for "_<slug>" between the prefix and the suffix.
+            has_slug = len(filename) >= len(prefix) + len(suffix) + 2
+            if has_slug and filename.endswith(suffix):
+                return descriptions.get(prefix + suffix, default)
+    return default
+
+
+def _write_backend_plot_report(backend: str, out_dir: Path) -> Path:
+    """Write ``plots_report.md`` under *out_dir*, listing every PNG in ``plots/``.
+
+    Images are discovered recursively and listed in sorted order; each gets a
+    heading derived from its file stem, its description, and an image link
+    relative to *out_dir*. Returns the path of the written report.
+    """
+    plot_root = out_dir / "plots"
+    png_files: list[Path] = []
+    if plot_root.exists():
+        png_files = sorted(plot_root.rglob("*.png"))
+
+    body = [
+        f"# {backend.title()} Analysis Plot Report (Regenerated)",
+        "",
+        "This document lists regenerated analysis plots with brief descriptions.",
+        "",
+    ]
+    if png_files:
+        for png in png_files:
+            link_target = png.relative_to(out_dir).as_posix()
+            heading = png.stem.replace("_", " ").title()
+            body.extend(
+                [
+                    f"## {heading}",
+                    _plot_description(png.name),
+                    "",
+                    f"![{heading}]({link_target})",
+                    "",
+                ]
+            )
+    else:
+        body.append("No plot images were found for this backend run.")
+
+    destination = out_dir / "plots_report.md"
+    destination.write_text("\n".join(body), encoding="utf-8")
+    return destination
+
+
+def regenerate_performance_plots(backend_dir: Path) -> dict[str, Any]:
+    """Regenerate performance threshold plots from existing CSV.
+
+    Reads ``performance_threshold_sweep.csv`` from *backend_dir* and rewrites
+    the accuracy, F1, and paired accuracy/F1 threshold plots under the
+    directory returned by ``plots_dir(backend_dir)``.
+
+    Returns:
+        ``{"status": "skipped", "reason": ...}`` when the CSV is missing,
+        otherwise ``{"status": "regenerated", ...}`` with the path of each
+        plot keyed by its plot key.
+    """
+    perf_csv = backend_dir / "performance_threshold_sweep.csv"
+    if not perf_csv.exists():
+        return {"status": "skipped", "reason": "no performance_threshold_sweep.csv"}
+
+    df = pd.read_csv(perf_csv)
+
+    # The backend label handed to the plotting helpers is the directory name.
+    # If the caller pointed at a "plots" subdirectory, take the label from the
+    # parent directory when it is a known backend; otherwise keep the
+    # historical "ensemble" default.
+    backend = backend_dir.name
+    if backend == "plots":
+        parent_name = backend_dir.parent.name
+        backend = parent_name if parent_name in {"ensemble", "bayesian"} else "ensemble"
+
+    out = plots_dir(backend_dir)
+    accuracy_plot_path = out / "performance_threshold_accuracy.png"
+    f1_plot_path = out / "performance_threshold_f1.png"
+    pair_plot_path = out / "performance_threshold_accuracy_f1.png"
+
+    save_performance_threshold_plot(
+        df=df,
+        backend=backend,
+        output_path=accuracy_plot_path,
+        metric_column="accuracy",
+        metric_label="Accuracy",
+        plot_key="performance_threshold_accuracy",
+    )
+    save_performance_threshold_plot(
+        df=df,
+        backend=backend,
+        output_path=f1_plot_path,
+        metric_column="f1",
+        metric_label="F1",
+        plot_key="performance_threshold_f1",
+    )
+    save_performance_threshold_pair_plot(
+        df=df,
+        backend=backend,
+        output_path=pair_plot_path,
+        plot_key="performance_threshold_accuracy_f1",
+    )
+
+    return {
+        "status": "regenerated",
+        "performance_threshold_accuracy": str(accuracy_plot_path),
+        "performance_threshold_f1": str(f1_plot_path),
+        "performance_threshold_accuracy_f1": str(pair_plot_path),
+    }
+
+
+def _regenerate_cutoff_family(
+    backend_dir: Path,
+    cutoff_df: pd.DataFrame,
+    family: str,
+    title_prefix: str,
+    x_label: str,
+) -> None:
+    """Render accuracy, F1, and paired plots for each uncertainty type in *cutoff_df*.
+
+    *family* is both the output filename stem and the plot-key stem
+    (``<family>_<slug>_<metric>.png`` / ``<family>_<metric>``).
+    """
+    out = plots_dir(backend_dir)
+    for uncertainty_name in sorted(pd.unique(cutoff_df["uncertainty_type"])):
+        sub_df = cutoff_df[cutoff_df["uncertainty_type"] == uncertainty_name].copy()
+        # str() keeps slug generation working for non-string labels.
+        slug = str(uncertainty_name).lower().replace(" ", "_")
+
+        for metric_column, metric_label in (("accuracy", "Accuracy"), ("f1", "F1")):
+            save_uncertainty_cutoff_plot(
+                cutoff_df=sub_df,
+                title_prefix=title_prefix,
+                x_label=x_label,
+                output_path=out / f"{family}_{slug}_{metric_column}.png",
+                metric_column=metric_column,
+                metric_label=metric_label,
+                plot_key=f"{family}_{metric_column}",
+            )
+        save_uncertainty_cutoff_pair_plot(
+            cutoff_df=sub_df,
+            title_prefix=title_prefix,
+            x_label=x_label,
+            output_path=out / f"{family}_{slug}_accuracy_f1.png",
+            plot_key=f"{family}_accuracy_f1",
+        )
+
+
+def regenerate_uncertainty_cutoff_plots(backend_dir: Path) -> dict[str, Any]:
+    """Regenerate uncertainty cutoff plots from existing CSV.
+
+    Looks for ``performance_uncertainty_cutoff.csv`` and
+    ``performance_uncertainty_percentile_cutoff.csv`` in *backend_dir* and, for
+    each one present, rewrites the per-uncertainty-type accuracy, F1, and
+    paired plots.
+
+    Returns:
+        ``{"status": "regenerated"}`` when at least one CSV was processed,
+        otherwise ``{"status": "skipped", "reason": ...}``. The skip reason is
+        only included when nothing was regenerated.
+    """
+    # (family / CSV stem, title prefix, x-axis label)
+    families = (
+        (
+            "performance_uncertainty_cutoff",
+            "Model Output / Uncertainty Cutoff Percentile",
+            "Restriction Level (0 = all samples, 100 = most restricted subset)",
+        ),
+        (
+            "performance_uncertainty_percentile_cutoff",
+            "Model Output / Uncertainty Percentile Floor",
+            "Percentile Floor (0 = all samples, 100 = top percentile subset)",
+        ),
+    )
+
+    regenerated = False
+    for family, title_prefix, x_label in families:
+        csv_path = backend_dir / f"{family}.csv"
+        if not csv_path.exists():
+            continue
+        _regenerate_cutoff_family(
+            backend_dir=backend_dir,
+            cutoff_df=pd.read_csv(csv_path),
+            family=family,
+            title_prefix=title_prefix,
+            x_label=x_label,
+        )
+        regenerated = True
+
+    if not regenerated:
+        return {"status": "skipped", "reason": "no cutoff CSV files found"}
+    return {"status": "regenerated"}
+
+
+def regenerate_calibration_plots(backend_dir: Path) -> dict[str, Any]:
+    """Regenerate calibration plots from existing calibration data."""
+    calib_path = backend_dir / "calibration_per_bin.npy"
+    if not calib_path.exists():
+        return {"status": "skipped", "reason": "no calibration_per_bin.npy"}
+
+    per_bin = np.load(calib_path)
+    backend = backend_dir.name if backend_dir.name != "plots" else "ensemble"
+
+    # Get backend name from parent directory name if not found
+    if backend_dir.parent.name not in ["ensemble", "bayesian"]:
+        parent_name = backend_dir.name
+        if parent_name in {"ensemble", "bayesian"}:
+            backend = parent_name
+
+    plot_path = plots_dir(backend_dir) / "calibration_reliability.png"
+    save_calibration_plot(per_bin=per_bin, backend=backend, output_path=plot_path)
+
+    return {
+        "status": "regenerated",
+        "calibration_reliability": str(plot_path),
+    }
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Regenerate analysis plots from existing computed data CSV files."
+    )
+    parser.add_argument(
+        "backend_dir",
+        type=Path,
+        help="Path to backend-specific analysis output directory "
+        "(e.g., analysis_output/run_xxx/ensemble)",
+    )
+
+    args = parser.parse_args()
+    backend_dir = args.backend_dir.resolve()
+
+    if not backend_dir.exists():
+        print(
+            f"Error: Backend directory does not exist: {backend_dir}", file=sys.stderr
+        )
+        sys.exit(1)
+
+    print(f"Regenerating plots from: {backend_dir}")
+
+    results: dict[str, Any] = {
+        "backend_dir": str(backend_dir),
+        "performance": regenerate_performance_plots(backend_dir),
+        "uncertainty_cutoff": regenerate_uncertainty_cutoff_plots(backend_dir),
+        "calibration": regenerate_calibration_plots(backend_dir),
+    }
+
+    # Write updated report
+    report_path = _write_backend_plot_report(
+        backend=backend_dir.name, out_dir=backend_dir
+    )
+    results["plots_report"] = str(report_path)
+
+    print(f"\nPlot regeneration complete!")
+    print(f"Results summary:")
+    print(f"  Performance plots: {results['performance'].get('status', 'unknown')}")
+    print(
+        f"  Uncertainty cutoff plots: {results['uncertainty_cutoff'].get('status', 'unknown')}"
+    )
+    print(f"  Calibration plots: {results['calibration'].get('status', 'unknown')}")
+    print(f"  Report written to: {report_path}")
+
+    write_json(backend_dir / "plot_regeneration_log.json", results)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()