3 maanden geleden · 1cae771e45
--- a/analysis/ANALYSES_OVERVIEW.md
+++ b/analysis/ANALYSES_OVERVIEW.md
@@ -0,0 +1,104 @@
 
				+# Analysis Overview
			
 
				+
			
 
				+This document summarizes what each analysis in this folder does and what it produces.
			
 
				+
			
 
				+## Runtime Defaults
			
 
				+
			
 
				+- Analysis defaults are centralized in `analysis/defaults.py`.
			
 
				+- The orchestrator CLI in `analysis/evaluate_models.py` is intentionally minimal and focuses on operational controls (`--backend`, `--run-name`, `--skip-noise`).
			
 
				+- Threshold grids, uncertainty percentile grids, noise-factor grids, calibration bins, class index, decision threshold, and bayesian MC passes are sourced from `analysis/defaults.py`.
			
 
				+
			
 
				+## Shared Utilities
			
 
				+
			
 
				+- Shared loader/split logic is centralized in `analysis/data_pipeline.py`.
			
 
				+- All plotting code is centralized in `analysis/plotting.py` for easier inspection and maintenance.
			
 
				+
			
 
				+## 1. Performance Threshold Sweep
			
 
				+
			
 
				+- Purpose: Measure classification performance as decision threshold changes.
			
 
				+- Inputs: Ground-truth labels and predicted probabilities.
			
 
				+- Method: Evaluate metrics across an evenly spaced threshold grid.
			
 
				+- Main outputs:
			
 
				+  - `performance_threshold_sweep.csv`
			
 
				+  - `plots/performance_threshold_sweep.png`
			
 
				+
			
 
				+## 2. Uncertainty Cutoff (Raw-Value)
			
 
				+
			
 
				+- Purpose: Evaluate performance on subsets with uncertainty below percentile-derived cutoff values.
			
 
				+- Inputs: Uncertainty arrays (confidence-derived uncertainty and backend-specific uncertainty).
			
 
				+- Method:
			
 
				+  - Build evenly spaced percentile points.
			
 
				+  - Convert each percentile to a raw uncertainty cutoff value.
			
 
				+  - Keep samples where uncertainty is less than or equal to that cutoff.
			
 
				+  - Compute accuracy and F1 for each retained subset.
			
 
				+- Main outputs:
			
 
				+  - `performance_uncertainty_cutoff.csv`
			
 
				+  - `plots/performance_uncertainty_cutoff.png`
			
 
				+
			
 
				+## 3. Uncertainty Cutoff (Percentile-Ranked)
			
 
				+
			
 
				+- Purpose: Evaluate performance from all samples toward only the lowest-uncertainty samples.
			
 
				+- Inputs: Same uncertainty arrays as above.
			
 
				+- Method:
			
 
				+  - Use an evenly spaced percentile grid.
			
 
				+  - Keep samples where uncertainty is less than or equal to the selected percentile cutoff.
			
 
				+  - Plot from least restricted on the left (all samples) to most restricted on the right (lowest-uncertainty subset only).
			
 
				+- Main outputs:
			
 
				+  - `performance_uncertainty_percentile_cutoff.csv`
			
 
				+  - `plots/performance_uncertainty_percentile_cutoff.png`
			
 
				+
			
 
				+## 4. Calibration Analysis
			
 
				+
			
 
				+- Purpose: Quantify probability calibration quality.
			
 
				+- Inputs: Ground-truth labels and predicted probabilities.
			
 
				+- Method:
			
 
				+  - Reliability binning with configurable bin count.
			
 
				+  - Compute ECE, MCE, and Brier score.
			
 
				+- Main outputs:
			
 
				+  - `calibration_bins.csv`
			
 
				+  - `plots/calibration_reliability.png`
			
 
				+
			
 
				+## 5. Physician Confidence Comparison
			
 
				+
			
 
				+- Purpose: Compare model uncertainty with physician confidence ratings.
			
 
				+- Inputs: Evaluation outputs plus clinical table (Image Data ID + physician confidence column).
			
 
				+- Method:
			
 
				+  - Merge by image ID.
			
 
				+  - Group metrics by physician confidence level.
			
 
				+  - Plot distributions per rating group.
			
 
				+- Main outputs include grouped summary CSV files and boxplots for confidence and secondary uncertainty metrics.
			
 
				+
			
 
				+## 6. Longitudinal Stability Analysis
			
 
				+
			
 
				+- Purpose: Examine uncertainty patterns across stable and transitioning patient trajectories.
			
 
				+- Inputs: Evaluation outputs and clinical timeline information.
			
 
				+- Method:
			
 
				+  - Build patient-level trajectories.
			
 
				+  - Group by clinical cohort dynamics.
			
 
				+  - Compare uncertainty summaries across cohorts.
			
 
				+- Main outputs include patient/cohort summary CSV files and cohort uncertainty plots.
			
 
				+
			
 
				+## 7. Noise Sensitivity Analysis
			
 
				+
			
 
				+- Purpose: Test robustness and uncertainty behavior under synthetic Gaussian noise.
			
 
				+- Inputs: Holdout data loader, model backend, noise factor schedule, threshold, calibration bins.
			
 
				+- Method:
			
 
				+  - Use an evenly spaced noise factor schedule.
			
 
				+  - Add Gaussian noise scaled by a fixed intensity-range factor.
			
 
				+  - Recompute performance and calibration at each sigma.
			
 
				+  - Save visual examples of noised images.
			
 
				+- Main outputs:
			
 
				+  - `noise_sensitivity.csv`
			
 
				+  - `plots/noise_sensitivity.png`
			
 
				+  - `plots/noise_uncertainty.png`
			
 
				+  - `plots/noise_confidence_certainty.png`
			
 
				+  - `plots/noise_examples/*_noise_examples.png`
			
 
				+
			
 
				+## Plot Report
			
 
				+
			
 
				+- Each backend output now includes `plots_report.md`.
			
 
				+- The report embeds all generated plot images and includes a short description per plot.
			
 
				+
			
 
				+## Notes on Even Spacing
			
 
				+
			
 
				+The pipeline uses evenly spaced grids for sampled x-axis points in threshold and uncertainty-cutoff performance plots, and for the sigma schedule used by noise sensitivity plots.
			
--- a/analysis/README.md
+++ b/analysis/README.md
@@ -32,6 +32,11 @@ Uncertainty analyses are now run using both:
 
				 ### Current Modules
			
 
				 
			
 
				 - `evaluate_models.py`: Orchestrator CLI for running selected analyses across ensemble and bayesian backends.
			
 
				+- `defaults.py`: Centralized analysis defaults (threshold grid, noise factors, calibration bins, class index, decision threshold, bayesian MC passes).
			
 
				+- `data_pipeline.py`: Shared analysis data loading and split/loader construction utilities.
			
 
				+- `plotting.py`: Shared plotting functions for all analysis visualizations.
			
 
				+- `uncertainty.py`: Shared confidence certainty/uncertainty transforms.
			
 
				+- `model_utils.py`: Shared model behavior utilities (including bayesian sampling mode setup).
			
 
				 - `runtime.py`: Runtime paths and JSON helpers.
			
 
				 - `data_access.py`: netCDF loading, class probability extraction, and clinical table access.
			
 
				 - `metrics.py`: Shared performance and calibration metrics.
			
@@ -51,10 +56,7 @@ Useful options:
 
				 ```bash
			
 
				 python -m analysis.evaluate_models \
			
 
				 	--backend ensemble bayesian \
			
 
				-	--run-name first_modular_run \
			
 
				-	--positive-class-index 0 \
			
 
				-	--calibration-bins 10 \
			
 
				-	--noise-sigmas 0.0 0.01 0.03 0.05 0.1
			
 
				+	--run-name first_modular_run
			
 
				 ```
			
 
				 
			
 
				 If you want to skip noise analysis while validating the pipeline:
			
@@ -63,6 +65,8 @@ If you want to skip noise analysis while validating the pipeline:
 
				 python -m analysis.evaluate_models --skip-noise
			
 
				 ```
			
 
				 
			
 
				+Most analysis tuning parameters are now intentionally centralized in `analysis/defaults.py` to reduce CLI verbosity and improve maintainability.
			
 
				+
			
 
				 ### Output Layout
			
 
				 
			
 
				 Each run creates a dedicated directory:
			
@@ -73,15 +77,14 @@ alnn_rewrite/analysis_output/
 
				 		run_manifest.json
			
 
				 		ensemble/
			
 
				 			backend_summary.json
			
 
				+			plots_report.md
			
 
				 			performance_threshold_sweep.csv
			
 
				-			performance_threshold_sweep.png
			
 
				 			calibration_bins.csv
			
 
				-			calibration_reliability.png
			
 
				+			performance_uncertainty_cutoff.csv
			
 
				+			performance_uncertainty_percentile_cutoff.csv
			
 
				 			physician_grouped_metrics.csv
			
 
				 			physician_confidence_grouped_metrics.csv
			
 
				 			physician_std_grouped_metrics.csv
			
 
				-			physician_confidence_boxplot.png
			
 
				-			physician_std_boxplot.png
			
 
				 			longitudinal_patient_summary.csv
			
 
				 			longitudinal_cohort_summary.csv
			
 
				 			longitudinal_uncertainty_by_cohort.csv
			
@@ -89,21 +92,33 @@ alnn_rewrite/analysis_output/
 
				 			longitudinal_std_patient_summary.csv
			
 
				 			longitudinal_confidence_cohort_summary.csv
			
 
				 			longitudinal_std_cohort_summary.csv
			
 
				-			longitudinal_cohort_confidence.png
			
 
				-			longitudinal_cohort_std.png
			
 
				 			noise_sensitivity.csv
			
 
				-			noise_sensitivity.png
			
 
				-			noise_uncertainty.png
			
 
				-			noise_confidence_certainty.png
			
 
				-			noise_examples/
			
 
				-				ensemble_noise_examples.png
			
 
				-				bayesian_noise_examples.png
			
 
				+			plots/
			
 
				+				performance_threshold_sweep.png
			
 
				+				calibration_reliability.png
			
 
				+				performance_uncertainty_cutoff.png
			
 
				+				performance_uncertainty_percentile_cutoff.png
			
 
				+				physician_confidence_boxplot.png
			
 
				+				physician_std_boxplot.png
			
 
				+				longitudinal_cohort_confidence.png
			
 
				+				longitudinal_cohort_std.png
			
 
				+				noise_sensitivity.png
			
 
				+				noise_uncertainty.png
			
 
				+				noise_confidence_certainty.png
			
 
				+				noise_examples/
			
 
				+					ensemble_noise_examples.png
			
 
				+					bayesian_noise_examples.png
			
 
				 		bayesian/
			
 
				 			... (same structure)
			
 
				 ```
			
 
				 
			
 
				 The default noise schedule now includes noisier settings beyond the earlier small-sigma cases, so the saved example images show a clearer progression from lightly noised to heavily noised volumes.
			
 
				 
			
 
				+Performance analysis now includes two uncertainty cutoff views:
			
 
				+
			
 
				+- A raw-value cutoff sweep that keeps samples with uncertainty below the percentile-derived cutoff value.
			
 
				+- A percentile-ranked cutoff sweep that is plotted from least restricted (left, all samples) to most restricted (right, lowest-uncertainty subset).
			
 
				+
			
 
				 For the noise analysis, the uncertainty plot uses the confidence metric in its uncertainty orientation, so higher values always mean more uncertainty. A separate certainty plot is also saved for direct inspection of `2 * |p - 0.5|`.
			
 
				 
			
 
				-Noise is now scaled by each sample's MRI intensity standard deviation, so sigma is dimensionless and interpretable across raw intensity ranges. In this setup, `sigma=1.0` means the injected noise standard deviation is approximately equal to the image's own standard deviation.
			
 
				+Noise plots now label the x-axis as Gaussian Noise Factor to reflect that values are multipliers on the actual noise sigma.
			
--- a/analysis/analysis_modules.py
+++ b/analysis/analysis_modules.py
@@ -5,13 +5,21 @@ from __future__ import annotations
 
				 from pathlib import Path
			
 
				 from typing import Any
			
 
				 
			
 
				-import matplotlib.pyplot as plt
			
 
				 import numpy as np
			
 
				 import pandas as pd
			
 
				 
			
 
				 from .data_access import BackendEvaluation, physician_column
			
 
				+from .defaults import DEFAULT_DECISION_THRESHOLD, uncertainty_cutoff_percentiles
			
 
				 from .metrics import calibration_stats, performance_at_threshold, threshold_sweep
			
 
				+from .plotting import (
			
 
				+    plots_dir,
			
 
				+    save_boxplot,
			
 
				+    save_calibration_plot,
			
 
				+    save_performance_threshold_plot,
			
 
				+    save_uncertainty_cutoff_plot,
			
 
				+)
			
 
				 from .runtime import write_json
			
 
				+from .uncertainty import confidence_uncertainty
			
 
				 
			
 
				 
			
 
				 def _save_table(rows: list[dict[str, Any]], out_path: Path) -> pd.DataFrame:
			
@@ -21,45 +29,61 @@ def _save_table(rows: list[dict[str, Any]], out_path: Path) -> pd.DataFrame:
 
				     return df
			
 
				 
			
 
				 
			
 
				-def run_performance(
			
 
				+def _uncertainty_percentiles(values: np.ndarray) -> np.ndarray:
			
 
				+    vals = np.asarray(values, dtype=float)
			
 
				+    n = len(vals)
			
 
				+    if n == 0:
			
 
				+        return np.asarray([], dtype=float)
			
 
				+    if n == 1:
			
 
				+        return np.asarray([0.0], dtype=float)
			
 
				+
			
 
				+    order = np.argsort(vals)
			
 
				+    ranks = np.empty(n, dtype=float)
			
 
				+    ranks[order] = np.arange(n, dtype=float)
			
 
				+    return (ranks / float(n - 1)) * 100.0
			
 
				+
			
 
				+
			
 
				+def _save_ensemble_prediction_debug(
			
 
				     evaluation: BackendEvaluation,
			
 
				     output_dir: Path,
			
 
				-    thresholds: np.ndarray,
			
 
				-) -> dict[str, Any]:
			
 
				-    rows = threshold_sweep(evaluation.y_true, evaluation.y_prob, thresholds)
			
 
				-    table_path = output_dir / "performance_threshold_sweep.csv"
			
 
				-    df = _save_table(rows, table_path)
			
 
				-
			
 
				-    fig, ax = plt.subplots(figsize=(10, 5))
			
 
				-    ax.plot(df["threshold"], df["accuracy"], label="accuracy", marker="o")
			
 
				-    ax.plot(df["threshold"], df["f1"], label="f1", marker="s")
			
 
				-    ax.set_xlabel("Threshold")
			
 
				-    ax.set_ylabel("Score")
			
 
				-    ax.set_title(f"Performance vs Threshold ({evaluation.backend})")
			
 
				-    ax.grid(True, alpha=0.3)
			
 
				-    ax.legend()
			
 
				-    fig.tight_layout()
			
 
				-    plot_path = output_dir / "performance_threshold_sweep.png"
			
 
				-    fig.savefig(plot_path)
			
 
				-    plt.close(fig)
			
 
				+) -> str:
			
 
				+    uncertainty_vals = confidence_uncertainty(evaluation.y_prob)
			
 
				+    uncertainty_pct = _uncertainty_percentiles(uncertainty_vals)
			
 
				+    pred_label = (evaluation.y_prob >= DEFAULT_DECISION_THRESHOLD).astype(int)
			
 
				+    true_label = np.asarray(evaluation.y_true, dtype=int)
			
 
				+    is_correct = (pred_label == true_label).astype(int)
			
 
				+
			
 
				+    debug_df = pd.DataFrame(
			
 
				+        {
			
 
				+            "image_id": evaluation.image_ids.astype(int),
			
 
				+            "predicted_probability": np.asarray(evaluation.y_prob, dtype=float),
			
 
				+            "predicted_label": pred_label,
			
 
				+            "actual_label": true_label,
			
 
				+            "is_correct": is_correct,
			
 
				+            "uncertainty": uncertainty_vals,
			
 
				+            "uncertainty_percentile": uncertainty_pct,
			
 
				+        }
			
 
				+    ).sort_values("uncertainty", ascending=True)
			
 
				 
			
 
				-    best_idx = int(df["f1"].idxmax())
			
 
				-    best = df.iloc[best_idx].to_dict()
			
 
				+    path = output_dir / "ensemble_prediction_debug.csv"
			
 
				+    debug_df.to_csv(path, index=False)
			
 
				+    return str(path)
			
 
				 
			
 
				-    cutoff_percentiles = np.array(
			
 
				-        [100, 95, 90, 85, 80, 75, 70, 60, 50, 40, 30, 20, 10, 5, 2, 1],
			
 
				-        dtype=float,
			
 
				-    )
			
 
				-    confidence_uncertainty = 1.0 - np.asarray(
			
 
				-        evaluation.uncertainty_confidence, dtype=float
			
 
				-    )
			
 
				-    secondary_uncertainty = np.asarray(evaluation.uncertainty_std, dtype=float)
			
 
				-    uncertainty_types = [
			
 
				-        ("confidence_uncertainty", confidence_uncertainty),
			
 
				-        (evaluation.uncertainty_metric, secondary_uncertainty),
			
 
				-    ]
			
 
				 
			
 
				+def _uncertainty_cutoff_analysis(
			
 
				+    evaluation: BackendEvaluation,
			
 
				+    output_dir: Path,
			
 
				+    uncertainty_types: list[tuple[str, np.ndarray]],
			
 
				+    cutoff_percentiles: np.ndarray,
			
 
				+    keep_higher: bool,
			
 
				+    table_filename: str,
			
 
				+    plot_filename: str,
			
 
				+    title_prefix: str,
			
 
				+    x_label: str,
			
 
				+    selection_label: str,
			
 
				+) -> dict[str, Any]:
			
 
				     cutoff_rows: list[dict[str, Any]] = []
			
 
				+
			
 
				     for uncertainty_name, values in uncertainty_types:
			
 
				         finite_mask = np.isfinite(values)
			
 
				         if not finite_mask.any():
			
@@ -70,9 +94,12 @@ def run_performance(
 
				         y_prob_valid = evaluation.y_prob[finite_mask]
			
 
				 
			
 
				         for cutoff_percentile in cutoff_percentiles:
			
 
				-            # Keep predictions whose uncertainty is <= percentile cutoff.
			
 
				             cutoff_value = float(np.percentile(values_valid, cutoff_percentile))
			
 
				-            keep_mask = values_valid <= cutoff_value
			
 
				+            if keep_higher:
			
 
				+                keep_mask = values_valid >= cutoff_value
			
 
				+            else:
			
 
				+                keep_mask = values_valid <= cutoff_value
			
 
				+
			
 
				             retained = int(keep_mask.sum())
			
 
				             if retained == 0:
			
 
				                 continue
			
@@ -80,7 +107,7 @@ def run_performance(
 
				             perf = performance_at_threshold(
			
 
				                 y_true=y_true_valid[keep_mask],
			
 
				                 y_prob=y_prob_valid[keep_mask],
			
 
				-                threshold=0.5,
			
 
				+                threshold=DEFAULT_DECISION_THRESHOLD,
			
 
				             )
			
 
				             cutoff_rows.append(
			
 
				                 {
			
@@ -89,44 +116,98 @@ def run_performance(
 
				                     "cutoff_value": cutoff_value,
			
 
				                     "n_retained": retained,
			
 
				                     "coverage": float(retained / len(values_valid)),
			
 
				+                    "selection_rule": selection_label,
			
 
				                     "accuracy": float(perf["accuracy"]),
			
 
				                     "f1": float(perf["f1"]),
			
 
				                 }
			
 
				             )
			
 
				 
			
 
				-    cutoff_table_path = output_dir / "performance_uncertainty_cutoff.csv"
			
 
				-    cutoff_plot_path = output_dir / "performance_uncertainty_cutoff.png"
			
 
				-    if cutoff_rows:
			
 
				-        cutoff_df = pd.DataFrame(cutoff_rows)
			
 
				-        cutoff_df.to_csv(cutoff_table_path, index=False)
			
 
				-
			
 
				-        fig_u, axes_u = plt.subplots(1, 2, figsize=(14, 5), sharex=True)
			
 
				-        for uncertainty_name, group in cutoff_df.groupby("uncertainty_type"):
			
 
				-            g = group.sort_values("cutoff_percentile", ascending=False)
			
 
				-            axes_u[0].plot(
			
 
				-                g["cutoff_percentile"],
			
 
				-                g["accuracy"],
			
 
				-                marker="o",
			
 
				-                label=uncertainty_name,
			
 
				-            )
			
 
				-            axes_u[1].plot(
			
 
				-                g["cutoff_percentile"],
			
 
				-                g["f1"],
			
 
				-                marker="s",
			
 
				-                label=uncertainty_name,
			
 
				-            )
			
 
				+    table_path = output_dir / table_filename
			
 
				+    plot_path = plots_dir(output_dir) / plot_filename
			
 
				+    if not cutoff_rows:
			
 
				+        return {"table": str(table_path), "plot": str(plot_path), "rows": 0}
			
 
				+
			
 
				+    cutoff_df = pd.DataFrame(cutoff_rows)
			
 
				+    cutoff_df.to_csv(table_path, index=False)
			
 
				+
			
 
				+    # Restriction level increases from left to right so right-most points are most restricted.
			
 
				+    if keep_higher:
			
 
				+        cutoff_df["restriction_level"] = cutoff_df["cutoff_percentile"]
			
 
				+    else:
			
 
				+        cutoff_df["restriction_level"] = 100.0 - cutoff_df["cutoff_percentile"]
			
 
				+
			
 
				+    save_uncertainty_cutoff_plot(
			
 
				+        cutoff_df=cutoff_df,
			
 
				+        title_prefix=title_prefix,
			
 
				+        x_label=x_label,
			
 
				+        output_path=plot_path,
			
 
				+    )
			
 
				 
			
 
				-        axes_u[0].set_title("Accuracy vs Uncertainty Cutoff Percentile")
			
 
				-        axes_u[1].set_title("F1 vs Uncertainty Cutoff Percentile")
			
 
				-        for ax in axes_u:
			
 
				-            ax.set_xlabel("Uncertainty Cutoff Percentile (100 = no cutoff)")
			
 
				-            ax.grid(True, alpha=0.3)
			
 
				-            ax.legend()
			
 
				-        axes_u[0].set_ylabel("Accuracy")
			
 
				-        axes_u[1].set_ylabel("F1")
			
 
				-        fig_u.tight_layout()
			
 
				-        fig_u.savefig(cutoff_plot_path)
			
 
				-        plt.close(fig_u)
			
 
				+    return {
			
 
				+        "table": str(table_path),
			
 
				+        "plot": str(plot_path),
			
 
				+        "rows": int(len(cutoff_df)),
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def run_performance(
			
 
				+    evaluation: BackendEvaluation,
			
 
				+    output_dir: Path,
			
 
				+    thresholds: np.ndarray,
			
 
				+) -> dict[str, Any]:
			
 
				+    rows = threshold_sweep(evaluation.y_true, evaluation.y_prob, thresholds)
			
 
				+    table_path = output_dir / "performance_threshold_sweep.csv"
			
 
				+    df = _save_table(rows, table_path)
			
 
				+
			
 
				+    plot_path = plots_dir(output_dir) / "performance_threshold_sweep.png"
			
 
				+    save_performance_threshold_plot(
			
 
				+        df=df,
			
 
				+        backend=evaluation.backend,
			
 
				+        output_path=plot_path,
			
 
				+    )
			
 
				+
			
 
				+    best_idx = int(df["f1"].idxmax())
			
 
				+    best = df.iloc[best_idx].to_dict()
			
 
				+
			
 
				+    cutoff_percentiles = uncertainty_cutoff_percentiles()
			
 
				+    confidence_uncertainty_vals = confidence_uncertainty(evaluation.y_prob)
			
 
				+    secondary_uncertainty = np.asarray(evaluation.uncertainty_std, dtype=float)
			
 
				+    uncertainty_types = [
			
 
				+        ("confidence_uncertainty", confidence_uncertainty_vals),
			
 
				+        (evaluation.uncertainty_metric, secondary_uncertainty),
			
 
				+    ]
			
 
				+
			
 
				+    uncertainty_cutoff = _uncertainty_cutoff_analysis(
			
 
				+        evaluation=evaluation,
			
 
				+        output_dir=output_dir,
			
 
				+        uncertainty_types=uncertainty_types,
			
 
				+        cutoff_percentiles=cutoff_percentiles,
			
 
				+        keep_higher=False,
			
 
				+        table_filename="performance_uncertainty_cutoff.csv",
			
 
				+        plot_filename="performance_uncertainty_cutoff.png",
			
 
				+        title_prefix="Uncertainty Cutoff Percentile",
			
 
				+        x_label="Restriction Level (0 = all results, 100 = lowest-uncertainty subset)",
			
 
				+        selection_label="uncertainty <= percentile cutoff",
			
 
				+    )
			
 
				+
			
 
				+    percentile_cutoffs = uncertainty_cutoff_percentiles()
			
 
				+    uncertainty_percentile_cutoff = _uncertainty_cutoff_analysis(
			
 
				+        evaluation=evaluation,
			
 
				+        output_dir=output_dir,
			
 
				+        uncertainty_types=uncertainty_types,
			
 
				+        cutoff_percentiles=percentile_cutoffs,
			
 
				+        keep_higher=True,
			
 
				+        table_filename="performance_uncertainty_percentile_cutoff.csv",
			
 
				+        plot_filename="performance_uncertainty_percentile_cutoff.png",
			
 
				+        title_prefix="Uncertainty Percentile Floor",
			
 
				+        x_label="Uncertainty Percentile Floor (0 = all results, 100 = highest-uncertainty subset)",
			
 
				+        selection_label="uncertainty >= percentile floor",
			
 
				+    )
			
 
				+
			
 
				+    cutoff_table_path = Path(uncertainty_cutoff["table"])
			
 
				+    cutoff_plot_path = Path(uncertainty_cutoff["plot"])
			
 
				+    percentile_cutoff_table_path = Path(uncertainty_percentile_cutoff["table"])
			
 
				+    percentile_cutoff_plot_path = Path(uncertainty_percentile_cutoff["plot"])
			
 
				     summary = {
			
 
				         "best_by_f1": {
			
 
				             k: float(v) for k, v in best.items() if isinstance(v, (int, float))
			
@@ -136,9 +217,19 @@ def run_performance(
 
				         "uncertainty_cutoff": {
			
 
				             "table": str(cutoff_table_path),
			
 
				             "plot": str(cutoff_plot_path),
			
 
				-            "decision_threshold": 0.5,
			
 
				+            "decision_threshold": DEFAULT_DECISION_THRESHOLD,
			
 
				+        },
			
 
				+        "uncertainty_percentile_cutoff": {
			
 
				+            "table": str(percentile_cutoff_table_path),
			
 
				+            "plot": str(percentile_cutoff_plot_path),
			
 
				+            "decision_threshold": DEFAULT_DECISION_THRESHOLD,
			
 
				         },
			
 
				     }
			
 
				+    if evaluation.backend == "ensemble":
			
 
				+        summary["ensemble_prediction_debug"] = _save_ensemble_prediction_debug(
			
 
				+            evaluation=evaluation,
			
 
				+            output_dir=output_dir,
			
 
				+        )
			
 
				     write_json(output_dir / "performance_summary.json", summary)
			
 
				     return summary
			
 
				 
			
@@ -159,24 +250,12 @@ def run_calibration(
 
				     table_path = output_dir / "calibration_bins.csv"
			
 
				     bin_df.to_csv(table_path, index=False)
			
 
				 
			
 
				-    fig, ax = plt.subplots(figsize=(6, 6))
			
 
				-    valid = ~np.isnan(per_bin[:, 1])
			
 
				-    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="ideal")
			
 
				-    ax.plot(
			
 
				-        per_bin[valid, 0],
			
 
				-        per_bin[valid, 1],
			
 
				-        marker="o",
			
 
				-        label=f"{evaluation.backend}",
			
 
				+    plot_path = plots_dir(output_dir) / "calibration_reliability.png"
			
 
				+    save_calibration_plot(
			
 
				+        per_bin=per_bin,
			
 
				+        backend=evaluation.backend,
			
 
				+        output_path=plot_path,
			
 
				     )
			
 
				-    ax.set_xlabel("Mean Predicted Probability")
			
 
				-    ax.set_ylabel("Empirical Fraction Positive")
			
 
				-    ax.set_title(f"Reliability Diagram ({evaluation.backend})")
			
 
				-    ax.legend()
			
 
				-    ax.grid(True, alpha=0.3)
			
 
				-    fig.tight_layout()
			
 
				-    plot_path = output_dir / "calibration_reliability.png"
			
 
				-    fig.savefig(plot_path)
			
 
				-    plt.close(fig)
			
 
				 
			
 
				     out = {
			
 
				         **summary,
			
@@ -252,20 +331,19 @@ def run_physician(
 
				             ]
			
 
				         )
			
 
				 
			
 
				-        fig, ax = plt.subplots(figsize=(9, 5))
			
 
				         data = [
			
 
				             np.asarray(merged.loc[merged[col] == r, metric_col], dtype=float)
			
 
				             for r in ratings
			
 
				         ]
			
 
				-        ax.boxplot(data, tick_labels=[str(r) for r in ratings])
			
 
				-        ax.set_xlabel("Physician Confidence Rating (DXCONFID)")
			
 
				-        ax.set_ylabel(metric_label)
			
 
				-        ax.set_title(f"{metric_label} vs Physician Confidence ({evaluation.backend})")
			
 
				-        ax.grid(True, axis="y", alpha=0.3)
			
 
				-        fig.tight_layout()
			
 
				-        plot_path = output_dir / f"physician_{metric_name}_boxplot.png"
			
 
				-        fig.savefig(plot_path)
			
 
				-        plt.close(fig)
			
 
				+        plot_path = plots_dir(output_dir) / f"physician_{metric_name}_boxplot.png"
			
 
				+        save_boxplot(
			
 
				+            data=data,
			
 
				+            tick_labels=[str(r) for r in ratings],
			
 
				+            x_label="Physician Confidence Rating (DXCONFID)",
			
 
				+            y_label=metric_label,
			
 
				+            title=f"{metric_label} vs Physician Confidence ({evaluation.backend})",
			
 
				+            output_path=plot_path,
			
 
				+        )
			
 
				 
			
 
				         corr = float(
			
 
				             pd.to_numeric(
			
@@ -431,21 +509,21 @@ def run_longitudinal(
 
				     ]
			
 
				     plot_paths: dict[str, str] = {}
			
 
				     for metric_name, metric_col, metric_label in uncertainty_specs:
			
 
				-        fig, ax = plt.subplots(figsize=(9, 5))
			
 
				         values = [
			
 
				             np.asarray(
			
 
				                 patient_df.loc[patient_df["cohort"] == c, metric_col], dtype=float
			
 
				             )
			
 
				             for c in cohorts
			
 
				         ]
			
 
				-        ax.boxplot(values, tick_labels=cohorts)
			
 
				-        ax.set_ylabel(metric_label)
			
 
				-        ax.set_title(f"Longitudinal Cohort {metric_label} ({evaluation.backend})")
			
 
				-        ax.grid(True, axis="y", alpha=0.3)
			
 
				-        fig.tight_layout()
			
 
				-        plot_path = output_dir / f"longitudinal_cohort_{metric_name}.png"
			
 
				-        fig.savefig(plot_path)
			
 
				-        plt.close(fig)
			
 
				+        plot_path = plots_dir(output_dir) / f"longitudinal_cohort_{metric_name}.png"
			
 
				+        save_boxplot(
			
 
				+            data=values,
			
 
				+            tick_labels=cohorts,
			
 
				+            x_label="Cohort",
			
 
				+            y_label=metric_label,
			
 
				+            title=f"Longitudinal Cohort {metric_label} ({evaluation.backend})",
			
 
				+            output_path=plot_path,
			
 
				+        )
			
 
				         plot_paths[metric_name] = str(plot_path)
			
 
				 
			
 
				     uncertainty_by_cohort = cohort_df.melt(
			
--- a/analysis/data_access.py
+++ b/analysis/data_access.py
@@ -11,6 +11,8 @@ import pandas as pd
 
				 import xarray as xr
			
 
				 from bayesian_torch.utils.util import predictive_entropy
			
 
				 
			
 
				+from .uncertainty import confidence_certainty
			
 
				+
			
 
				 
			
 
				 @dataclass
			
 
				 class BackendEvaluation:
			
@@ -130,7 +132,7 @@ def load_backend_evaluation(
 
				     y_prob, y_std, uncertainty_metric = _positive_probability(
			
 
				         predictions, class_index=class_index
			
 
				     )
			
 
				-    conf = 2.0 * np.abs(y_prob - 0.5)
			
 
				+    conf = confidence_certainty(y_prob)
			
 
				 
			
 
				     if len(y_true) != len(y_prob):
			
 
				         raise ValueError(
			
--- a/analysis/data_pipeline.py
+++ b/analysis/data_pipeline.py
@@ -0,0 +1,88 @@
 
				+# pyright: basic
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+from pathlib import Path
			
 
				+from typing import Any
			
 
				+
			
 
				+import pandas as pd
			
 
				+from torch.utils.data import ConcatDataset, DataLoader, Subset
			
 
				+
			
 
				+from data.dataset import (
			
 
				+    ADNIDataset,
			
 
				+    divide_dataset_by_patient_id,
			
 
				+    initalize_dataloaders,
			
 
				+    load_adni_data_from_file,
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def xls_preprocess(df: pd.DataFrame) -> pd.DataFrame:
			
 
				+    data = df[["Image Data ID", "Sex", "Age (current)"]].copy()
			
 
				+    data["Sex"] = data["Sex"].astype(str).str.strip()
			
 
				+    data = data.replace({"M": 0, "F": 1})
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+def _patient_ids(xls_file: Path) -> list[tuple[int, str]]:
			
 
				+    ptid_df = pd.read_csv(xls_file)
			
 
				+    ptid_df.columns = ptid_df.columns.str.strip()
			
 
				+    ptid_df = ptid_df[["Image Data ID", "PTID"]].dropna(
			
 
				+        subset=["Image Data ID", "PTID"]
			
 
				+    )
			
 
				+    ptid_df["Image Data ID"] = ptid_df["Image Data ID"].astype(int)
			
 
				+    ptid_df["PTID"] = ptid_df["PTID"].astype(str).str.strip()
			
 
				+    ptid_df = ptid_df[ptid_df["PTID"] != ""]
			
 
				+    return list(zip(ptid_df["Image Data ID"].tolist(), ptid_df["PTID"].tolist()))
			
 
				+
			
 
				+
			
 
				+def build_dataset(config: dict[str, Any], root_dir: Path) -> tuple[ADNIDataset, Path]:
			
 
				+    mri_files = (root_dir / config["data"]["mri_files_path"]).resolve().glob("*.nii")
			
 
				+    xls_file = (root_dir / config["data"]["xls_file_path"]).resolve()
			
 
				+
			
 
				+    dataset = load_adni_data_from_file(
			
 
				+        mri_files,
			
 
				+        xls_file,
			
 
				+        device=config["training"]["device"],
			
 
				+        xls_preprocessor=xls_preprocess,
			
 
				+    )
			
 
				+    return dataset, xls_file
			
 
				+
			
 
				+
			
 
				+def build_dataset_splits(
			
 
				+    config: dict[str, Any],
			
 
				+    dataset: ADNIDataset,
			
 
				+    xls_file: Path,
			
 
				+    seed: int,
			
 
				+) -> list[Subset[ADNIDataset]]:
			
 
				+    return divide_dataset_by_patient_id(
			
 
				+        dataset,
			
 
				+        _patient_ids(xls_file),
			
 
				+        tuple(config["data"]["data_splits"]),
			
 
				+        seed=seed,
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+def build_dataset_and_test_loader(
			
 
				+    config: dict[str, Any],
			
 
				+    root_dir: Path,
			
 
				+    seed: int,
			
 
				+) -> tuple[ADNIDataset, DataLoader]:
			
 
				+    dataset, xls_file = build_dataset(config, root_dir)
			
 
				+    splits = build_dataset_splits(config, dataset, xls_file, seed=seed)
			
 
				+    _, _, test_loader = initalize_dataloaders(
			
 
				+        splits,
			
 
				+        batch_size=int(config["training"]["batch_size"]),
			
 
				+    )
			
 
				+    return dataset, test_loader
			
 
				+
			
 
				+
			
 
				+def build_holdout_loader(
			
 
				+    config: dict[str, Any],
			
 
				+    root_dir: Path,
			
 
				+    seed: int,
			
 
				+) -> DataLoader:
			
 
				+    dataset, xls_file = build_dataset(config, root_dir)
			
 
				+    splits = build_dataset_splits(config, dataset, xls_file, seed=seed)
			
 
				+    _, val_loader, test_loader = initalize_dataloaders(splits, batch_size=1)
			
 
				+    combined = ConcatDataset([val_loader.dataset, test_loader.dataset])
			
 
				+    return DataLoader(combined, batch_size=1, shuffle=False)
			
--- a/analysis/defaults.py
+++ b/analysis/defaults.py
@@ -0,0 +1,45 @@
 
				+# pyright: basic
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import numpy as np
			
 
				+
			
 
				+DEFAULT_BACKENDS = ["ensemble", "bayesian"]
			
 
				+DEFAULT_POSITIVE_CLASS_INDEX = 0
			
 
				+DEFAULT_CALIBRATION_BINS = 10
			
 
				+DEFAULT_BAYESIAN_MC_PASSES = 20
			
 
				+DEFAULT_DECISION_THRESHOLD = 0.5
			
 
				+
			
 
				+THRESHOLD_START = 0.5
			
 
				+THRESHOLD_STOP = 0.95
			
 
				+THRESHOLD_STEP = 0.05
			
 
				+
			
 
				+NOISE_FACTOR_MIN = 0.0
			
 
				+NOISE_FACTOR_MAX = 7.0
			
 
				+NOISE_FACTOR_COUNT = 21
			
 
				+
			
 
				+UNCERTAINTY_CUTOFF_COUNT = 21
			
 
				+
			
 
				+
			
 
				+def threshold_grid() -> np.ndarray:
			
 
				+    if THRESHOLD_STEP <= 0:
			
 
				+        raise ValueError("THRESHOLD_STEP must be > 0")
			
 
				+    if THRESHOLD_STOP < THRESHOLD_START:
			
 
				+        raise ValueError("THRESHOLD_STOP must be >= THRESHOLD_START")
			
 
				+
			
 
				+    n = int(round((THRESHOLD_STOP - THRESHOLD_START) / THRESHOLD_STEP))
			
 
				+    return np.linspace(THRESHOLD_START, THRESHOLD_STOP, num=n + 1, dtype=float)
			
 
				+
			
 
				+
			
 
				+def uncertainty_cutoff_percentiles() -> np.ndarray:
			
 
				+    return np.linspace(0.0, 100.0, num=UNCERTAINTY_CUTOFF_COUNT, dtype=float)
			
 
				+
			
 
				+
			
 
				+def noise_factor_grid() -> list[float]:
			
 
				+    factors = np.linspace(
			
 
				+        NOISE_FACTOR_MIN,
			
 
				+        NOISE_FACTOR_MAX,
			
 
				+        num=NOISE_FACTOR_COUNT,
			
 
				+        dtype=float,
			
 
				+    )
			
 
				+    return [float(v) for v in factors]
			
--- a/analysis/evaluate_models.py
+++ b/analysis/evaluate_models.py
@@ -6,7 +6,6 @@ import argparse
 
				 from pathlib import Path
			
 
				 from typing import Any
			
 
				 
			
 
				-import numpy as np
			
 
				 import pandas as pd
			
 
				 
			
 
				 from analysis.analysis_modules import (
			
@@ -16,11 +15,68 @@ from analysis.analysis_modules import (
 
				     run_physician,
			
 
				 )
			
 
				 from analysis.data_access import load_backend_evaluation, load_clinical_table
			
 
				+from analysis.defaults import (
			
 
				+    DEFAULT_BACKENDS,
			
 
				+    DEFAULT_BAYESIAN_MC_PASSES,
			
 
				+    DEFAULT_CALIBRATION_BINS,
			
 
				+    DEFAULT_DECISION_THRESHOLD,
			
 
				+    DEFAULT_POSITIVE_CLASS_INDEX,
			
 
				+    noise_factor_grid,
			
 
				+    threshold_grid,
			
 
				+)
			
 
				 from analysis.holdout_evaluation import ensure_backend_netcdf
			
 
				 from analysis.noise_analysis import run_noise_analysis
			
 
				 from analysis.runtime import backend_dir, init_runtime_paths, load_config, write_json
			
 
				 
			
 
				 
			
 
				+def _plot_description(filename: str) -> str:
			
 
				+    descriptions = {
			
 
				+        "performance_threshold_sweep.png": "Accuracy and F1 as the decision threshold varies.",
			
 
				+        "performance_uncertainty_cutoff.png": "Performance while progressively restricting to lower-uncertainty predictions.",
			
 
				+        "performance_uncertainty_percentile_cutoff.png": "Percentile-ranked low-uncertainty subset performance from least to most restricted.",
			
 
				+        "calibration_reliability.png": "Reliability diagram comparing predicted probability to empirical outcome frequency.",
			
 
				+        "physician_confidence_boxplot.png": "Model confidence grouped by physician confidence ratings.",
			
 
				+        "physician_std_boxplot.png": "Model secondary uncertainty grouped by physician confidence ratings.",
			
 
				+        "physician_predictive_entropy_boxplot.png": "Predictive entropy grouped by physician confidence ratings.",
			
 
				+        "longitudinal_cohort_confidence.png": "Longitudinal cohort comparison using model confidence.",
			
 
				+        "longitudinal_cohort_std.png": "Longitudinal cohort comparison using ensemble standard deviation uncertainty.",
			
 
				+        "longitudinal_cohort_predictive_entropy.png": "Longitudinal cohort comparison using predictive entropy uncertainty.",
			
 
				+        "noise_sensitivity.png": "Performance metrics across increasing Gaussian noise factors.",
			
 
				+        "noise_uncertainty.png": "Uncertainty metrics across increasing Gaussian noise factors.",
			
 
				+        "noise_confidence_certainty.png": "Confidence certainty trend across increasing Gaussian noise factors.",
			
 
				+        "ensemble_noise_examples.png": "Representative image slices with progressively larger Gaussian noise factors.",
			
 
				+        "bayesian_noise_examples.png": "Representative image slices with progressively larger Gaussian noise factors.",
			
 
				+    }
			
 
				+    return descriptions.get(filename, "Generated analysis plot.")
			
 
				+
			
 
				+
			
 
				+def _write_backend_plot_report(backend: str, out_dir: Path) -> Path:
			
 
				+    plots_dir = out_dir / "plots"
			
 
				+    images = sorted(plots_dir.rglob("*.png")) if plots_dir.exists() else []
			
 
				+
			
 
				+    report_path = out_dir / "plots_report.md"
			
 
				+    lines = [
			
 
				+        f"# {backend.title()} Analysis Plot Report",
			
 
				+        "",
			
 
				+        "This document lists generated analysis plots with brief descriptions.",
			
 
				+        "",
			
 
				+    ]
			
 
				+    if not images:
			
 
				+        lines.append("No plot images were generated for this backend run.")
			
 
				+    else:
			
 
				+        for image_path in images:
			
 
				+            rel = image_path.relative_to(out_dir).as_posix()
			
 
				+            title = image_path.stem.replace("_", " ").title()
			
 
				+            lines.append(f"## {title}")
			
 
				+            lines.append(_plot_description(image_path.name))
			
 
				+            lines.append("")
			
 
				+            lines.append(f"![{title}]({rel})")
			
 
				+            lines.append("")
			
 
				+
			
 
				+    report_path.write_text("\n".join(lines), encoding="utf-8")
			
 
				+    return report_path
			
 
				+
			
 
				+
			
 
				 def _parse_args() -> argparse.Namespace:
			
 
				     parser = argparse.ArgumentParser(
			
 
				         description=(
			
@@ -32,7 +88,7 @@ def _parse_args() -> argparse.Namespace:
 
				         "--backend",
			
 
				         nargs="+",
			
 
				         choices=["ensemble", "bayesian"],
			
 
				-        default=["ensemble", "bayesian"],
			
 
				+        default=DEFAULT_BACKENDS,
			
 
				         help="Backends to evaluate.",
			
 
				     )
			
 
				     parser.add_argument(
			
@@ -41,112 +97,36 @@ def _parse_args() -> argparse.Namespace:
 
				         help="Optional run directory name under analysis_output.",
			
 
				     )
			
 
				     parser.add_argument(
			
 
				-        "--threshold-start",
			
 
				-        type=float,
			
 
				-        default=0.5,
			
 
				-        help="Threshold sweep start.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--threshold-stop",
			
 
				-        type=float,
			
 
				-        default=0.95,
			
 
				-        help="Threshold sweep stop (inclusive).",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--threshold-step",
			
 
				-        type=float,
			
 
				-        default=0.05,
			
 
				-        help="Threshold sweep step.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--decision-threshold",
			
 
				-        type=float,
			
 
				-        default=0.5,
			
 
				-        help="Decision threshold used in noise analysis.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--positive-class-index",
			
 
				-        type=int,
			
 
				-        default=0,
			
 
				-        help="Positive class index in both predictions.img_class and labels.label.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--calibration-bins",
			
 
				-        type=int,
			
 
				-        default=10,
			
 
				-        help="Number of reliability bins used for ECE/MCE.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				         "--skip-noise",
			
 
				         action="store_true",
			
 
				         help="Skip Gaussian noise sensitivity analysis.",
			
 
				     )
			
 
				-    parser.add_argument(
			
 
				-        "--noise-sigmas",
			
 
				-        nargs="+",
			
 
				-        type=float,
			
 
				-        default=[
			
 
				-            0.0,
			
 
				-            0.01,
			
 
				-            0.03,
			
 
				-            0.05,
			
 
				-            0.1,
			
 
				-            0.2,
			
 
				-            0.3,
			
 
				-            0.4,
			
 
				-            0.5,
			
 
				-            0.6,
			
 
				-            0.75,
			
 
				-            1.0,
			
 
				-        ],
			
 
				-        help="Gaussian noise sigmas for sensitivity analysis.",
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--bayesian-mc-passes",
			
 
				-        type=int,
			
 
				-        default=20,
			
 
				-        help="MC forward passes for bayesian noise analysis.",
			
 
				-    )
			
 
				     return parser.parse_args()
			
 
				 
			
 
				 
			
 
				-def _threshold_array(start: float, stop: float, step: float) -> np.ndarray:
			
 
				-    if step <= 0:
			
 
				-        raise ValueError("threshold-step must be > 0")
			
 
				-    if stop < start:
			
 
				-        raise ValueError("threshold-stop must be >= threshold-start")
			
 
				-
			
 
				-    # Include stop when it lands on a step boundary.
			
 
				-    n = int(round((stop - start) / step))
			
 
				-    return np.array([start + i * step for i in range(n + 1)], dtype=float)
			
 
				-
			
 
				-
			
 
				 def _run_backend(
			
 
				     config: dict[str, Any],
			
 
				     root_dir: Path,
			
 
				     backend: str,
			
 
				     clinical_df: pd.DataFrame,
			
 
				-    args: argparse.Namespace,
			
 
				+    skip_noise: bool,
			
 
				     out_dir: Path,
			
 
				 ) -> dict[str, Any]:
			
 
				     netcdf_path = ensure_backend_netcdf(
			
 
				         config=config,
			
 
				         root_dir=root_dir,
			
 
				         backend=backend,
			
 
				-        bayesian_mc_passes=int(args.bayesian_mc_passes),
			
 
				+        bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES,
			
 
				     )
			
 
				 
			
 
				     evaluation = load_backend_evaluation(
			
 
				         config=config,
			
 
				         backend=backend,
			
 
				-        class_index=int(args.positive_class_index),
			
 
				+        class_index=DEFAULT_POSITIVE_CLASS_INDEX,
			
 
				     )
			
 
				 
			
 
				-    thresholds = _threshold_array(
			
 
				-        start=float(args.threshold_start),
			
 
				-        stop=float(args.threshold_stop),
			
 
				-        step=float(args.threshold_step),
			
 
				-    )
			
 
				+    thresholds = threshold_grid()
			
 
				+    noise_factors = noise_factor_grid()
			
 
				 
			
 
				     summary: dict[str, Any] = {
			
 
				         "backend": backend,
			
@@ -163,7 +143,7 @@ def _run_backend(
 
				     summary["calibration"] = run_calibration(
			
 
				         evaluation=evaluation,
			
 
				         output_dir=out_dir,
			
 
				-        bins=int(args.calibration_bins),
			
 
				+        bins=DEFAULT_CALIBRATION_BINS,
			
 
				     )
			
 
				     summary["physician"] = run_physician(
			
 
				         evaluation=evaluation,
			
@@ -176,20 +156,20 @@ def _run_backend(
 
				         output_dir=out_dir,
			
 
				     )
			
 
				 
			
 
				-    if args.skip_noise:
			
 
				+    if skip_noise:
			
 
				         summary["noise"] = {"skipped": True, "reason": "--skip-noise supplied"}
			
 
				     else:
			
 
				         try:
			
 
				             summary["noise"] = run_noise_analysis(
			
 
				                 config=config,
			
 
				-                root_dir=Path(__file__).resolve().parents[1],
			
 
				+                root_dir=root_dir,
			
 
				                 backend=backend,
			
 
				                 output_dir=out_dir,
			
 
				-                class_index=int(args.positive_class_index),
			
 
				-                noise_sigmas=[float(x) for x in args.noise_sigmas],
			
 
				-                threshold=float(args.decision_threshold),
			
 
				-                calibration_bins=int(args.calibration_bins),
			
 
				-                bayesian_mc_passes=int(args.bayesian_mc_passes),
			
 
				+                class_index=DEFAULT_POSITIVE_CLASS_INDEX,
			
 
				+                noise_sigmas=noise_factors,
			
 
				+                threshold=DEFAULT_DECISION_THRESHOLD,
			
 
				+                calibration_bins=DEFAULT_CALIBRATION_BINS,
			
 
				+                bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES,
			
 
				             )
			
 
				         except Exception as exc:
			
 
				             summary["noise"] = {
			
@@ -197,6 +177,8 @@ def _run_backend(
 
				                 "reason": f"Noise analysis failed: {exc}",
			
 
				             }
			
 
				 
			
 
				+    report_path = _write_backend_plot_report(backend=backend, out_dir=out_dir)
			
 
				+    summary["plots_report"] = str(report_path)
			
 
				     write_json(out_dir / "backend_summary.json", summary)
			
 
				     return summary
			
 
				 
			
@@ -212,14 +194,14 @@ def main() -> None:
 
				     manifest: dict[str, Any] = {
			
 
				         "run_dir": str(paths.run_dir),
			
 
				         "output_root": str(paths.output_root),
			
 
				-        "positive_class_index": int(args.positive_class_index),
			
 
				+        "positive_class_index": DEFAULT_POSITIVE_CLASS_INDEX,
			
 
				         "threshold_sweep": {
			
 
				-            "start": float(args.threshold_start),
			
 
				-            "stop": float(args.threshold_stop),
			
 
				-            "step": float(args.threshold_step),
			
 
				+            "values": [float(v) for v in threshold_grid().tolist()],
			
 
				         },
			
 
				-        "calibration_bins": int(args.calibration_bins),
			
 
				-        "noise_sigmas": [float(x) for x in args.noise_sigmas],
			
 
				+        "calibration_bins": DEFAULT_CALIBRATION_BINS,
			
 
				+        "noise_factors": noise_factor_grid(),
			
 
				+        "bayesian_mc_passes": DEFAULT_BAYESIAN_MC_PASSES,
			
 
				+        "decision_threshold": DEFAULT_DECISION_THRESHOLD,
			
 
				         "backends": {},
			
 
				     }
			
 
				 
			
@@ -230,7 +212,7 @@ def main() -> None:
 
				             root_dir=paths.root_dir,
			
 
				             backend=backend,
			
 
				             clinical_df=clinical_df,
			
 
				-            args=args,
			
 
				+            skip_noise=bool(args.skip_noise),
			
 
				             out_dir=out_dir,
			
 
				         )
			
 
				 
			
--- a/analysis/holdout_evaluation.py
+++ b/analysis/holdout_evaluation.py
@@ -7,33 +7,14 @@ from pathlib import Path
 
				 from typing import Any
			
 
				 
			
 
				 import numpy as np
			
 
				-import pandas as pd
			
 
				 import torch
			
 
				-from torch.utils.data import ConcatDataset, DataLoader
			
 
				+from torch.utils.data import DataLoader
			
 
				 import xarray as xr
			
 
				 
			
 
				-from data.dataset import (
			
 
				-    divide_dataset_by_patient_id,
			
 
				-    initalize_dataloaders,
			
 
				-    load_adni_data_from_file,
			
 
				-)
			
 
				 from model.cnn import CNN3D
			
 
				 
			
 
				-
			
 
				-def _enable_bayesian_sampling_mode(model: torch.nn.Module) -> None:
			
 
				-    # Keep stochastic Bayesian layers in training mode, but freeze BatchNorm
			
 
				-    # statistics to support batch_size=1 inference on holdout samples.
			
 
				-    model.train()
			
 
				-    for module in model.modules():
			
 
				-        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
			
 
				-            module.eval()
			
 
				-
			
 
				-
			
 
				-def _xls_pre(df: pd.DataFrame) -> pd.DataFrame:
			
 
				-    data = df[["Image Data ID", "Sex", "Age (current)"]].copy()
			
 
				-    data["Sex"] = data["Sex"].astype(str).str.strip()
			
 
				-    data = data.replace({"M": 0, "F": 1})
			
 
				-    return data
			
 
				+from .data_pipeline import build_holdout_loader
			
 
				+from .model_utils import configure_bayesian_sampling_mode
			
 
				 
			
 
				 
			
 
				 def _training_seed(config: dict[str, Any], backend_dir: Path) -> int:
			
@@ -48,41 +29,6 @@ def _training_seed(config: dict[str, Any], backend_dir: Path) -> int:
 
				     return int(config["data"]["seed"])
			
 
				 
			
 
				 
			
 
				-def _build_holdout_loader(
			
 
				-    config: dict[str, Any], root_dir: Path, backend_dir: Path
			
 
				-) -> DataLoader:
			
 
				-    mri_files = (root_dir / config["data"]["mri_files_path"]).resolve().glob("*.nii")
			
 
				-    xls_file = (root_dir / config["data"]["xls_file_path"]).resolve()
			
 
				-
			
 
				-    dataset = load_adni_data_from_file(
			
 
				-        mri_files,
			
 
				-        xls_file,
			
 
				-        device=config["training"]["device"],
			
 
				-        xls_preprocessor=_xls_pre,
			
 
				-    )
			
 
				-
			
 
				-    ptid_df = pd.read_csv(xls_file)
			
 
				-    ptid_df.columns = ptid_df.columns.str.strip()
			
 
				-    ptid_df = ptid_df[["Image Data ID", "PTID"]].dropna(
			
 
				-        subset=["Image Data ID", "PTID"]
			
 
				-    )
			
 
				-    ptid_df["Image Data ID"] = ptid_df["Image Data ID"].astype(int)
			
 
				-    ptid_df["PTID"] = ptid_df["PTID"].astype(str).str.strip()
			
 
				-    ptid_df = ptid_df[ptid_df["PTID"] != ""]
			
 
				-
			
 
				-    ptids = list(zip(ptid_df["Image Data ID"].tolist(), ptid_df["PTID"].tolist()))
			
 
				-    datasets = divide_dataset_by_patient_id(
			
 
				-        dataset,
			
 
				-        ptids,
			
 
				-        tuple(config["data"]["data_splits"]),
			
 
				-        seed=_training_seed(config, backend_dir),
			
 
				-    )
			
 
				-
			
 
				-    _, val_loader, test_loader = initalize_dataloaders(datasets, batch_size=1)
			
 
				-    combined = ConcatDataset([val_loader.dataset, test_loader.dataset])
			
 
				-    return DataLoader(combined, batch_size=1, shuffle=False)
			
 
				-
			
 
				-
			
 
				 def _init_cnn(config: dict[str, Any]) -> torch.nn.Module:
			
 
				     return (
			
 
				         CNN3D(
			
@@ -196,7 +142,11 @@ def _evaluate_bayesian(
 
				     model.to(device)
			
 
				     model.load_state_dict(torch.load(ckpt, map_location=device), strict=False)
			
 
				     model.to(device)
			
 
				-    _enable_bayesian_sampling_mode(model)
			
 
				+    configure_bayesian_sampling_mode(
			
 
				+        model,
			
 
				+        stochastic=False,
			
 
				+        freeze_batchnorm=True,
			
 
				+    )
			
 
				 
			
 
				     n_samples = len(holdout_loader)
			
 
				     n_classes = int(config["data"]["num_classes"])
			
@@ -253,7 +203,11 @@ def ensure_backend_netcdf(
 
				     if output_path.exists():
			
 
				         return output_path
			
 
				 
			
 
				-    holdout_loader = _build_holdout_loader(config, root_dir, backend_dir)
			
 
				+    holdout_loader = build_holdout_loader(
			
 
				+        config=config,
			
 
				+        root_dir=root_dir,
			
 
				+        seed=_training_seed(config, backend_dir),
			
 
				+    )
			
 
				 
			
 
				     if backend == "ensemble":
			
 
				         ds = _evaluate_ensemble(config, backend_dir, holdout_loader)
			
--- a/analysis/model_utils.py
+++ b/analysis/model_utils.py
@@ -0,0 +1,46 @@
 
				+# pyright: basic
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import torch
			
 
				+from data.dataset import ADNIDataset
			
 
				+
			
 
				+
			
 
				+def configure_bayesian_sampling_mode(
			
 
				+    model: torch.nn.Module,
			
 
				+    *,
			
 
				+    stochastic: bool,
			
 
				+    freeze_batchnorm: bool = True,
			
 
				+) -> None:
			
 
				+    # if stochastic:
			
 
				+    #     model.train()
			
 
				+    #     if freeze_batchnorm:
			
 
				+    #         for module in model.modules():
			
 
				+    #             if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
			
 
				+    #                 module.eval()
			
 
				+    # else:
			
 
				+    model.eval()
			
 
				+
			
 
				+
			
 
				+def _compute_mri_intensity_range(
			
 
				+    dataset: ADNIDataset,
			
 
				+) -> tuple[float, float, float]:
			
 
				+    global_min = float("inf")
			
 
				+    global_max = float("-inf")
			
 
				+    for idx in range(len(dataset)):
			
 
				+        mri, _, _, _ = dataset[idx]
			
 
				+        sample_min = float(mri.detach().min().cpu())
			
 
				+        sample_max = float(mri.detach().max().cpu())
			
 
				+        if sample_min < global_min:
			
 
				+            global_min = sample_min
			
 
				+        if sample_max > global_max:
			
 
				+            global_max = sample_max
			
 
				+
			
 
				+    intensity_range = global_max - global_min
			
 
				+    if not np.isfinite(intensity_range) or intensity_range <= 0:
			
 
				+        intensity_range = 1.0
			
 
				+    print(
			
 
				+        "Computed MRI intensity range across the full dataset: "
			
 
				+        f"min={global_min:.6g}, max={global_max:.6g}, range={intensity_range:.6g}"
			
 
				+    )
			
 
				+    return float(global_min), float(global_max), float(intensity_range)
			
--- a/analysis/noise_analysis.py
+++ b/analysis/noise_analysis.py
@@ -5,142 +5,45 @@ from __future__ import annotations
 
				 from pathlib import Path
			
 
				 from typing import Any
			
 
				 
			
 
				-import matplotlib.pyplot as plt
			
 
				 import numpy as np
			
 
				 import pandas as pd
			
 
				 import torch
			
 
				 from bayesian_torch.utils.util import predictive_entropy
			
 
				 
			
 
				-from data.dataset import (
			
 
				-    divide_dataset_by_patient_id,
			
 
				-    initalize_dataloaders,
			
 
				-    load_adni_data_from_file,
			
 
				-)
			
 
				 from model.cnn import CNN3D
			
 
				 
			
 
				+from .data_pipeline import build_dataset_and_test_loader
			
 
				 from .metrics import calibration_stats, performance_at_threshold
			
 
				+from .model_utils import configure_bayesian_sampling_mode
			
 
				+from .plotting import (
			
 
				+    plots_dir,
			
 
				+    save_noise_example_grid,
			
 
				+    save_noise_metrics_plot,
			
 
				+)
			
 
				 from .runtime import write_json
			
 
				+from .uncertainty import confidence_certainty, confidence_uncertainty
			
 
				 
			
 
				 
			
 
				-def _enable_bayesian_sampling_mode(model: torch.nn.Module) -> None:
			
 
				-    model.eval()
			
 
				-
			
 
				-
			
 
				-def _central_slice(volume: torch.Tensor) -> np.ndarray:
			
 
				-    tensor = volume.detach().cpu()
			
 
				-    if tensor.ndim == 5:
			
 
				-        tensor = tensor[0]
			
 
				-    if tensor.ndim == 4:
			
 
				-        tensor = tensor[0]
			
 
				-    if tensor.ndim != 3:
			
 
				-        raise ValueError(
			
 
				-            f"Expected a 3D volume after squeezing batch/channel, got shape {tuple(tensor.shape)}"
			
 
				-        )
			
 
				-
			
 
				-    center_index = tensor.shape[0] // 2
			
 
				-    return tensor[center_index].numpy().astype(float)
			
 
				-
			
 
				-
			
 
				-def _normalize_for_display(image: np.ndarray) -> np.ndarray:
			
 
				-    low = float(np.percentile(image, 1))
			
 
				-    high = float(np.percentile(image, 99))
			
 
				-    if high <= low:
			
 
				-        return np.zeros_like(image, dtype=float)
			
 
				-    clipped = np.clip(image, low, high)
			
 
				-    return (clipped - low) / (high - low)
			
 
				-
			
 
				-
			
 
				-def _save_noise_example_grid(
			
 
				-    original_mri: torch.Tensor,
			
 
				-    noisy_by_sigma: list[tuple[float, torch.Tensor]],
			
 
				-    output_path: Path,
			
 
				-    title: str,
			
 
				-) -> None:
			
 
				-    if not noisy_by_sigma:
			
 
				-        return
			
 
				-
			
 
				-    original_slice = _normalize_for_display(_central_slice(original_mri))
			
 
				-    n_rows = len(noisy_by_sigma)
			
 
				-    fig, axes = plt.subplots(n_rows, 2, figsize=(8, 3.2 * n_rows))
			
 
				-    if n_rows == 1:
			
 
				-        axes = np.array([axes])
			
 
				-
			
 
				-    for row_idx, (sigma, noisy_tensor) in enumerate(noisy_by_sigma):
			
 
				-        noisy_slice = _normalize_for_display(_central_slice(noisy_tensor))
			
 
				-        ax_orig, ax_noisy = axes[row_idx]
			
 
				-        ax_orig.imshow(original_slice, cmap="gray")
			
 
				-        ax_orig.set_title(f"Original")
			
 
				-        ax_orig.axis("off")
			
 
				-
			
 
				-        ax_noisy.imshow(noisy_slice, cmap="gray")
			
 
				-        ax_noisy.set_title(f"Noisy sigma={sigma:g}")
			
 
				-        ax_noisy.axis("off")
			
 
				-
			
 
				-    fig.suptitle(title)
			
 
				-    fig.tight_layout()
			
 
				-    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				-    fig.savefig(output_path)
			
 
				-    plt.close(fig)
			
 
				-
			
 
				-
			
 
				-def _confidence_certainty(y_prob: np.ndarray) -> np.ndarray:
			
 
				-    # 2 * |p - 0.5| maps 0 -> very uncertain and 1 -> very certain.
			
 
				-    return 2.0 * np.abs(y_prob - 0.5)
			
 
				 
			
 
				+def _apply_scaled_noise(
			
 
				+    volume: torch.Tensor, sigma: float, intensity_range: float
			
 
				+) -> torch.Tensor:
			
 
				+    # Scale by global MRI intensity range measured from holdout set.
			
 
				+    return volume + (torch.randn_like(volume) * sigma * intensity_range)
			
 
				 
			
 
				-def _confidence_uncertainty(y_prob: np.ndarray) -> np.ndarray:
			
 
				-    # Invert certainty so that larger values mean more uncertainty, matching std.
			
 
				-    return 1.0 - _confidence_certainty(y_prob)
			
 
				 
			
 
				+def _uniform_sigma_schedule(noise_sigmas: list[float]) -> list[float]:
			
 
				+    if not noise_sigmas:
			
 
				+        raise ValueError("noise_sigmas must contain at least one value")
			
 
				 
			
 
				-def _apply_scaled_noise(volume: torch.Tensor, sigma: float) -> torch.Tensor:
			
 
				-    # Make sigma dimensionless with respect to MRI intensity scale.
			
 
				-    # sigma=1.0 means noise std equals total MRI intensity range, which is 65535 .
			
 
				-    return volume + (torch.randn_like(volume) * sigma * 65535)
			
 
				+    ordered = np.array(sorted(float(s) for s in noise_sigmas), dtype=float)
			
 
				+    if len(ordered) == 1:
			
 
				+        return [float(ordered[0])]
			
 
				 
			
 
				-
			
 
				-def _xls_pre(df: pd.DataFrame) -> pd.DataFrame:
			
 
				-    data = df[["Image Data ID", "Sex", "Age (current)"]].copy()
			
 
				-    data["Sex"] = data["Sex"].astype(str).str.strip()
			
 
				-    data = data.replace({"M": 0, "F": 1})
			
 
				-    return data
			
 
				-
			
 
				-
			
 
				-def _build_test_loader(
			
 
				-    config: dict[str, Any], root_dir: Path
			
 
				-) -> torch.utils.data.DataLoader:
			
 
				-    mri_files = (root_dir / config["data"]["mri_files_path"]).resolve().glob("*.nii")
			
 
				-    xls_file = (root_dir / config["data"]["xls_file_path"]).resolve()
			
 
				-
			
 
				-    dataset = load_adni_data_from_file(
			
 
				-        mri_files,
			
 
				-        xls_file,
			
 
				-        device=config["training"]["device"],
			
 
				-        xls_preprocessor=_xls_pre,
			
 
				+    uniform = np.linspace(
			
 
				+        float(ordered[0]), float(ordered[-1]), num=len(ordered), dtype=float
			
 
				     )
			
 
				-
			
 
				-    ptid_df = pd.read_csv(xls_file)
			
 
				-    ptid_df.columns = ptid_df.columns.str.strip()
			
 
				-    ptid_df = ptid_df[["Image Data ID", "PTID"]].dropna(
			
 
				-        subset=["Image Data ID", "PTID"]
			
 
				-    )
			
 
				-    ptid_df["Image Data ID"] = ptid_df["Image Data ID"].astype(int)
			
 
				-    ptid_df["PTID"] = ptid_df["PTID"].astype(str).str.strip()
			
 
				-    ptid_df = ptid_df[ptid_df["PTID"] != ""]
			
 
				-    ptids = list(zip(ptid_df["Image Data ID"].tolist(), ptid_df["PTID"].tolist()))
			
 
				-
			
 
				-    splits = divide_dataset_by_patient_id(
			
 
				-        dataset,
			
 
				-        ptids,
			
 
				-        tuple(config["data"]["data_splits"]),
			
 
				-        seed=int(config["data"]["seed"]),
			
 
				-    )
			
 
				-
			
 
				-    _, _, test_loader = initalize_dataloaders(
			
 
				-        splits,
			
 
				-        batch_size=int(config["training"]["batch_size"]),
			
 
				-    )
			
 
				-    return test_loader
			
 
				+    return [float(s) for s in uniform]
			
 
				 
			
 
				 
			
 
				 def _load_ensemble_models(config: dict[str, Any]) -> list[torch.nn.Module]:
			
@@ -208,7 +111,7 @@ def _load_bayesian_model(config: dict[str, Any]) -> torch.nn.Module:
 
				     model.to(device)
			
 
				     model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
			
 
				     model.to(device)
			
 
				-    _enable_bayesian_sampling_mode(model)
			
 
				+    configure_bayesian_sampling_mode(model, stochastic=False)
			
 
				     return model
			
 
				 
			
 
				 
			
@@ -216,6 +119,7 @@ def _infer_with_noise_ensemble(
 
				     test_loader: torch.utils.data.DataLoader,
			
 
				     models: list[torch.nn.Module],
			
 
				     sigma: float,
			
 
				+    intensity_range: float,
			
 
				     class_index: int,
			
 
				 ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
			
 
				     if not models:
			
@@ -231,7 +135,7 @@ def _infer_with_noise_ensemble(
 
				             mri_device = mri.float().to(device)
			
 
				             xls_device = xls.float().to(device)
			
 
				             labels_device = labels.to(device)
			
 
				-            noisy = _apply_scaled_noise(mri_device, sigma)
			
 
				+            noisy = _apply_scaled_noise(mri_device, sigma, intensity_range)
			
 
				             preds = []
			
 
				             for model in models:
			
 
				                 out = model((noisy, xls_device))
			
@@ -253,6 +157,7 @@ def _infer_with_noise_bayesian(
 
				     test_loader: torch.utils.data.DataLoader,
			
 
				     model: torch.nn.Module,
			
 
				     sigma: float,
			
 
				+    intensity_range: float,
			
 
				     class_index: int,
			
 
				     mc_passes: int,
			
 
				 ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
			
@@ -266,7 +171,7 @@ def _infer_with_noise_bayesian(
 
				             mri_device = mri.float().to(device)
			
 
				             xls_device = xls.float().to(device)
			
 
				             labels_device = labels.to(device)
			
 
				-            noisy = _apply_scaled_noise(mri_device, sigma)
			
 
				+            noisy = _apply_scaled_noise(mri_device, sigma, intensity_range)
			
 
				             draws = []
			
 
				             for _ in range(mc_passes):
			
 
				                 out = model((noisy, xls_device))
			
@@ -295,8 +200,21 @@ def run_noise_analysis(
 
				     calibration_bins: int,
			
 
				     bayesian_mc_passes: int,
			
 
				 ) -> dict[str, Any]:
			
 
				-    test_loader = _build_test_loader(config, root_dir)
			
 
				-    examples_dir = output_dir / "noise_examples"
			
 
				+    noise_sigmas = _uniform_sigma_schedule(noise_sigmas)
			
 
				+    dataset, test_loader = build_dataset_and_test_loader(
			
 
				+        config=config,
			
 
				+        root_dir=root_dir,
			
 
				+        seed=int(config["data"]["seed"]),
			
 
				+    )
			
 
				+    # intensity_min, intensity_max, intensity_range = _compute_mri_intensity_range(
			
 
				+    #     dataset
			
 
				+    # )
			
 
				+
			
 
				+    # Just use a fixed intensity range for noise scaling since all that matters is that it's consistent
			
 
				+    intensity_range = 10_000.0
			
 
				+
			
 
				+    out_plots_dir = plots_dir(output_dir)
			
 
				+    examples_dir = out_plots_dir / "noise_examples"
			
 
				     examples_dir.mkdir(parents=True, exist_ok=True)
			
 
				 
			
 
				     rows: list[dict[str, Any]] = []
			
@@ -308,6 +226,7 @@ def run_noise_analysis(
 
				                 test_loader,
			
 
				                 models,
			
 
				                 sigma,
			
 
				+                intensity_range,
			
 
				                 class_index=class_index,
			
 
				             )
			
 
				             perf = performance_at_threshold(y_true, y_prob, threshold)
			
@@ -315,19 +234,20 @@ def run_noise_analysis(
 
				             rows.append(
			
 
				                 {
			
 
				                     "uncertainty_metric": "std",
			
 
				-                    "sigma": float(sigma),
			
 
				+                    "noise_factor": float(sigma),
			
 
				                     "accuracy": float(perf["accuracy"]),
			
 
				                     "f1": float(perf["f1"]),
			
 
				                     "ece": float(cal["ece"]),
			
 
				                     "mce": float(cal["mce"]),
			
 
				                     "mean_confidence_certainty": float(
			
 
				-                        np.nanmean(_confidence_certainty(y_prob))
			
 
				+                        np.nanmean(confidence_certainty(y_prob))
			
 
				                     ),
			
 
				                     "mean_confidence_uncertainty": float(
			
 
				-                        np.nanmean(_confidence_uncertainty(y_prob))
			
 
				+                        np.nanmean(confidence_uncertainty(y_prob))
			
 
				                     ),
			
 
				                     "mean_std": float(np.nanmean(y_std)),
			
 
				                     "mean_predictive_entropy": float("nan"),
			
 
				+                    "mri_intensity_range": float(intensity_range),
			
 
				                 }
			
 
				             )
			
 
				 
			
@@ -337,10 +257,10 @@ def run_noise_analysis(
 
				             device = next(models[0].parameters()).device
			
 
				             original_device = original_mri.float().to(device)
			
 
				             for sigma in noise_sigmas:
			
 
				-                noisy_mri = _apply_scaled_noise(original_device, sigma)
			
 
				+                noisy_mri = _apply_scaled_noise(original_device, sigma, intensity_range)
			
 
				                 example_rows.append((float(sigma), noisy_mri.detach().cpu()))
			
 
				 
			
 
				-        _save_noise_example_grid(
			
 
				+        save_noise_example_grid(
			
 
				             original_mri=original_mri,
			
 
				             noisy_by_sigma=example_rows,
			
 
				             output_path=examples_dir / f"{backend}_noise_examples.png",
			
@@ -354,6 +274,7 @@ def run_noise_analysis(
 
				                 test_loader,
			
 
				                 model,
			
 
				                 sigma,
			
 
				+                intensity_range,
			
 
				                 class_index=class_index,
			
 
				                 mc_passes=bayesian_mc_passes,
			
 
				             )
			
@@ -362,20 +283,21 @@ def run_noise_analysis(
 
				             rows.append(
			
 
				                 {
			
 
				                     "uncertainty_metric": "predictive_entropy",
			
 
				-                    "sigma": float(sigma),
			
 
				+                    "noise_factor": float(sigma),
			
 
				                     "accuracy": float(perf["accuracy"]),
			
 
				                     "f1": float(perf["f1"]),
			
 
				                     "ece": float(cal["ece"]),
			
 
				                     "mce": float(cal["mce"]),
			
 
				                     "mean_confidence_certainty": float(
			
 
				-                        np.nanmean(_confidence_certainty(y_prob))
			
 
				+                        np.nanmean(confidence_certainty(y_prob))
			
 
				                     ),
			
 
				                     "mean_confidence_uncertainty": float(
			
 
				-                        np.nanmean(_confidence_uncertainty(y_prob))
			
 
				+                        np.nanmean(confidence_uncertainty(y_prob))
			
 
				                     ),
			
 
				                     # Compatibility field name retained for downstream code.
			
 
				                     "mean_std": float(np.nanmean(y_std)),
			
 
				                     "mean_predictive_entropy": float(np.nanmean(y_std)),
			
 
				+                    "mri_intensity_range": float(intensity_range),
			
 
				                 }
			
 
				             )
			
 
				 
			
@@ -385,10 +307,10 @@ def run_noise_analysis(
 
				             device = next(model.parameters()).device
			
 
				             original_device = original_mri.float().to(device)
			
 
				             for sigma in noise_sigmas:
			
 
				-                noisy_mri = _apply_scaled_noise(original_device, sigma)
			
 
				+                noisy_mri = _apply_scaled_noise(original_device, sigma, intensity_range)
			
 
				                 example_rows.append((float(sigma), noisy_mri.detach().cpu()))
			
 
				 
			
 
				-        _save_noise_example_grid(
			
 
				+        save_noise_example_grid(
			
 
				             original_mri=original_mri,
			
 
				             noisy_by_sigma=example_rows,
			
 
				             output_path=examples_dir / f"{backend}_noise_examples.png",
			
@@ -397,65 +319,58 @@ def run_noise_analysis(
 
				     else:
			
 
				         raise ValueError(f"Unsupported backend for noise analysis: {backend}")
			
 
				 
			
 
				-    df = pd.DataFrame(rows).sort_values("sigma")
			
 
				+    df = pd.DataFrame(rows).sort_values("noise_factor")
			
 
				     csv_path = output_dir / "noise_sensitivity.csv"
			
 
				     df.to_csv(csv_path, index=False)
			
 
				 
			
 
				-    fig, ax = plt.subplots(figsize=(10, 5))
			
 
				-    ax.plot(df["sigma"], df["accuracy"], marker="o", label="accuracy")
			
 
				-    ax.plot(df["sigma"], df["f1"], marker="s", label="f1")
			
 
				-    ax.plot(df["sigma"], df["ece"], marker="^", label="ece")
			
 
				-    ax.set_xlabel("Gaussian Noise Sigma")
			
 
				-    ax.set_ylabel("Score")
			
 
				-    ax.set_title(f"Noise Sensitivity ({backend})")
			
 
				-    ax.grid(True, alpha=0.3)
			
 
				-    ax.legend()
			
 
				-    fig.tight_layout()
			
 
				-    plot_path = output_dir / "noise_sensitivity.png"
			
 
				-    fig.savefig(plot_path)
			
 
				-    plt.close(fig)
			
 
				-
			
 
				-    fig_u, ax_u = plt.subplots(figsize=(10, 5))
			
 
				-    ax_u.plot(
			
 
				-        df["sigma"],
			
 
				-        df["mean_confidence_uncertainty"],
			
 
				-        marker="o",
			
 
				-        label="confidence_uncertainty",
			
 
				+    plot_path = out_plots_dir / "noise_sensitivity.png"
			
 
				+    uncertainty_plot_path = out_plots_dir / "noise_uncertainty.png"
			
 
				+    certainty_plot_path = out_plots_dir / "noise_confidence_certainty.png"
			
 
				+
			
 
				+    save_noise_metrics_plot(
			
 
				+        x=df["noise_factor"],
			
 
				+        y_by_label=[
			
 
				+            (df["accuracy"], "o", "accuracy"),
			
 
				+            (df["f1"], "s", "f1"),
			
 
				+            (df["ece"], "^", "ece"),
			
 
				+        ],
			
 
				+        x_label="Gaussian Noise Factor",
			
 
				+        y_label="Score",
			
 
				+        title=f"Noise Sensitivity ({backend})",
			
 
				+        output_path=plot_path,
			
 
				+    )
			
 
				+    save_noise_metrics_plot(
			
 
				+        x=df["noise_factor"],
			
 
				+        y_by_label=[
			
 
				+            (df["mean_confidence_uncertainty"], "o", "confidence_uncertainty"),
			
 
				+            (df["mean_std"], "s", "std_uncertainty"),
			
 
				+        ],
			
 
				+        x_label="Gaussian Noise Factor",
			
 
				+        y_label="Uncertainty",
			
 
				+        title=f"Uncertainty vs Noise ({backend})",
			
 
				+        output_path=uncertainty_plot_path,
			
 
				     )
			
 
				-    ax_u.plot(df["sigma"], df["mean_std"], marker="s", label="std_uncertainty")
			
 
				-    ax_u.set_xlabel("Gaussian Noise Sigma")
			
 
				-    ax_u.set_ylabel("Uncertainty")
			
 
				-    ax_u.set_title(f"Uncertainty vs Noise ({backend})")
			
 
				-    ax_u.grid(True, alpha=0.3)
			
 
				-    ax_u.legend()
			
 
				-    fig_u.tight_layout()
			
 
				-    uncertainty_plot_path = output_dir / "noise_uncertainty.png"
			
 
				-    fig_u.savefig(uncertainty_plot_path)
			
 
				-    plt.close(fig_u)
			
 
				-
			
 
				-    fig_c, ax_c = plt.subplots(figsize=(10, 5))
			
 
				-    ax_c.plot(
			
 
				-        df["sigma"],
			
 
				-        df["mean_confidence_certainty"],
			
 
				-        marker="o",
			
 
				-        label="confidence_certainty",
			
 
				+    save_noise_metrics_plot(
			
 
				+        x=df["noise_factor"],
			
 
				+        y_by_label=[
			
 
				+            (df["mean_confidence_certainty"], "o", "confidence_certainty"),
			
 
				+        ],
			
 
				+        x_label="Gaussian Noise Factor",
			
 
				+        y_label="Certainty",
			
 
				+        title=f"Confidence Certainty vs Noise ({backend})",
			
 
				+        output_path=certainty_plot_path,
			
 
				     )
			
 
				-    ax_c.set_xlabel("Gaussian Noise Sigma")
			
 
				-    ax_c.set_ylabel("Certainty")
			
 
				-    ax_c.set_title(f"Confidence Certainty vs Noise ({backend})")
			
 
				-    ax_c.grid(True, alpha=0.3)
			
 
				-    ax_c.legend()
			
 
				-    fig_c.tight_layout()
			
 
				-    certainty_plot_path = output_dir / "noise_confidence_certainty.png"
			
 
				-    fig_c.savefig(certainty_plot_path)
			
 
				-    plt.close(fig_c)
			
 
				 
			
 
				     out = {
			
 
				         "table": str(csv_path),
			
 
				         "plot": str(plot_path),
			
 
				         "uncertainty_plot": str(uncertainty_plot_path),
			
 
				         "certainty_plot": str(certainty_plot_path),
			
 
				+        "noise_factors": noise_sigmas,
			
 
				         "noise_sigmas": noise_sigmas,
			
 
				+        # "mri_intensity_min": float(intensity_min),
			
 
				+        # "mri_intensity_max": float(intensity_max),
			
 
				+        "mri_intensity_range": float(intensity_range),
			
 
				     }
			
 
				     write_json(output_dir / "noise_summary.json", out)
			
 
				     return out
			
--- a/analysis/plotting.py
+++ b/analysis/plotting.py
@@ -0,0 +1,178 @@
 
				+# pyright: basic
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+from pathlib import Path
			
 
				+
			
 
				+import matplotlib.pyplot as plt
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+import torch
			
 
				+
			
 
				+
			
 
				+def plots_dir(output_dir: Path) -> Path:
			
 
				+    plots = output_dir / "plots"
			
 
				+    plots.mkdir(parents=True, exist_ok=True)
			
 
				+    return plots
			
 
				+
			
 
				+
			
 
				+def save_performance_threshold_plot(
			
 
				+    df: pd.DataFrame, backend: str, output_path: Path
			
 
				+) -> None:
			
 
				+    fig, ax = plt.subplots(figsize=(10, 5))
			
 
				+    ax.plot(df["threshold"], df["accuracy"], label="accuracy", marker="o")
			
 
				+    ax.plot(df["threshold"], df["f1"], label="f1", marker="s")
			
 
				+    ax.set_xlabel("Threshold")
			
 
				+    ax.set_ylabel("Score")
			
 
				+    ax.set_title(f"Performance vs Threshold ({backend})")
			
 
				+    ax.grid(True, alpha=0.3)
			
 
				+    ax.legend()
			
 
				+    fig.tight_layout()
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+    fig.savefig(output_path)
			
 
				+    plt.close(fig)
			
 
				+
			
 
				+
			
 
				+def save_uncertainty_cutoff_plot(
			
 
				+    cutoff_df: pd.DataFrame,
			
 
				+    title_prefix: str,
			
 
				+    x_label: str,
			
 
				+    output_path: Path,
			
 
				+) -> None:
			
 
				+    fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True)
			
 
				+    for uncertainty_name, group in cutoff_df.groupby("uncertainty_type"):
			
 
				+        g = group.sort_values("restriction_level")
			
 
				+        axes[0].plot(
			
 
				+            g["restriction_level"], g["accuracy"], marker="o", label=uncertainty_name
			
 
				+        )
			
 
				+        axes[1].plot(
			
 
				+            g["restriction_level"], g["f1"], marker="s", label=uncertainty_name
			
 
				+        )
			
 
				+
			
 
				+    axes[0].set_title(f"Accuracy vs {title_prefix}")
			
 
				+    axes[1].set_title(f"F1 vs {title_prefix}")
			
 
				+    for ax in axes:
			
 
				+        ax.set_xlabel(x_label)
			
 
				+        ax.grid(True, alpha=0.3)
			
 
				+        ax.legend()
			
 
				+    axes[0].set_ylabel("Accuracy")
			
 
				+    axes[1].set_ylabel("F1")
			
 
				+    fig.tight_layout()
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+    fig.savefig(output_path)
			
 
				+    plt.close(fig)
			
 
				+
			
 
				+
			
 
				+def save_calibration_plot(per_bin: np.ndarray, backend: str, output_path: Path) -> None:
			
 
				+    fig, ax = plt.subplots(figsize=(6, 6))
			
 
				+    valid = ~np.isnan(per_bin[:, 1])
			
 
				+    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="ideal")
			
 
				+    ax.plot(per_bin[valid, 0], per_bin[valid, 1], marker="o", label=backend)
			
 
				+    ax.set_xlabel("Mean Predicted Probability")
			
 
				+    ax.set_ylabel("Empirical Fraction Positive")
			
 
				+    ax.set_title(f"Reliability Diagram ({backend})")
			
 
				+    ax.legend()
			
 
				+    ax.grid(True, alpha=0.3)
			
 
				+    fig.tight_layout()
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+    fig.savefig(output_path)
			
 
				+    plt.close(fig)
			
 
				+
			
 
				+
			
 
				+def save_boxplot(
			
 
				+    data: list[np.ndarray],
			
 
				+    tick_labels: list[str],
			
 
				+    x_label: str,
			
 
				+    y_label: str,
			
 
				+    title: str,
			
 
				+    output_path: Path,
			
 
				+) -> None:
			
 
				+    fig, ax = plt.subplots(figsize=(9, 5))
			
 
				+    ax.boxplot(data, tick_labels=tick_labels)
			
 
				+    ax.set_xlabel(x_label)
			
 
				+    ax.set_ylabel(y_label)
			
 
				+    ax.set_title(title)
			
 
				+    ax.grid(True, axis="y", alpha=0.3)
			
 
				+    fig.tight_layout()
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+    fig.savefig(output_path)
			
 
				+    plt.close(fig)
			
 
				+
			
 
				+
			
 
				+def _central_slice(volume: torch.Tensor) -> np.ndarray:
			
 
				+    tensor = volume.detach().cpu()
			
 
				+    if tensor.ndim == 5:
			
 
				+        tensor = tensor[0]
			
 
				+    if tensor.ndim == 4:
			
 
				+        tensor = tensor[0]
			
 
				+    if tensor.ndim != 3:
			
 
				+        raise ValueError(
			
 
				+            f"Expected a 3D volume after squeezing batch/channel, got shape {tuple(tensor.shape)}"
			
 
				+        )
			
 
				+
			
 
				+    center_index = tensor.shape[0] // 2
			
 
				+    return tensor[center_index].numpy().astype(float)
			
 
				+
			
 
				+
			
 
				+def _normalize_for_display(image: np.ndarray) -> np.ndarray:
			
 
				+    low = float(np.percentile(image, 1))
			
 
				+    high = float(np.percentile(image, 99))
			
 
				+    if high <= low:
			
 
				+        return np.zeros_like(image, dtype=float)
			
 
				+    clipped = np.clip(image, low, high)
			
 
				+    return (clipped - low) / (high - low)
			
 
				+
			
 
				+
			
 
				+def save_noise_example_grid(
			
 
				+    original_mri: torch.Tensor,
			
 
				+    noisy_by_sigma: list[tuple[float, torch.Tensor]],
			
 
				+    output_path: Path,
			
 
				+    title: str,
			
 
				+) -> None:
			
 
				+    if not noisy_by_sigma:
			
 
				+        return
			
 
				+
			
 
				+    original_slice = _normalize_for_display(_central_slice(original_mri))
			
 
				+    n_rows = len(noisy_by_sigma)
			
 
				+    fig, axes = plt.subplots(n_rows, 2, figsize=(8, 3.2 * n_rows))
			
 
				+    if n_rows == 1:
			
 
				+        axes = np.array([axes])
			
 
				+
			
 
				+    for row_idx, (sigma, noisy_tensor) in enumerate(noisy_by_sigma):
			
 
				+        noisy_slice = _normalize_for_display(_central_slice(noisy_tensor))
			
 
				+        ax_orig, ax_noisy = axes[row_idx]
			
 
				+        ax_orig.imshow(original_slice, cmap="gray")
			
 
				+        ax_orig.set_title("Original")
			
 
				+        ax_orig.axis("off")
			
 
				+
			
 
				+        ax_noisy.imshow(noisy_slice, cmap="gray")
			
 
				+        ax_noisy.set_title(f"Noisy factor={sigma:g}")
			
 
				+        ax_noisy.axis("off")
			
 
				+
			
 
				+    fig.suptitle(title)
			
 
				+    fig.tight_layout()
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+    fig.savefig(output_path)
			
 
				+    plt.close(fig)
			
 
				+
			
 
				+
			
 
				+def save_noise_metrics_plot(
			
 
				+    x: pd.Series,
			
 
				+    y_by_label: list[tuple[pd.Series, str, str]],
			
 
				+    x_label: str,
			
 
				+    y_label: str,
			
 
				+    title: str,
			
 
				+    output_path: Path,
			
 
				+) -> None:
			
 
				+    fig, ax = plt.subplots(figsize=(10, 5))
			
 
				+    for series, marker, label in y_by_label:
			
 
				+        ax.plot(x, series, marker=marker, label=label)
			
 
				+    ax.set_xlabel(x_label)
			
 
				+    ax.set_ylabel(y_label)
			
 
				+    ax.set_title(title)
			
 
				+    ax.grid(True, alpha=0.3)
			
 
				+    ax.legend()
			
 
				+    fig.tight_layout()
			
 
				+    output_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+    fig.savefig(output_path)
			
 
				+    plt.close(fig)
			
--- a/analysis/uncertainty.py
+++ b/analysis/uncertainty.py
@@ -0,0 +1,16 @@
 
				+# pyright: basic
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import numpy as np
			
 
				+
			
 
				+
			
 
				+def confidence_certainty(y_prob: np.ndarray) -> np.ndarray:
			
 
				+    # 2 * |p - 0.5| maps 0 -> very uncertain and 1 -> very certain.
			
 
				+    return 2.0 * np.abs(np.asarray(y_prob, dtype=float) - 0.5)
			
 
				+
			
 
				+
			
 
				+def confidence_uncertainty(y_prob: np.ndarray) -> np.ndarray:
			
 
				+    # Normalized uncertainty in [0, 1]: 1 at p=0.5 and 0 at p in {0, 1}.
			
 
				+    uncertainty = 1.0 - confidence_certainty(np.asarray(y_prob, dtype=float))
			
 
				+    return np.clip(uncertainty, 0.0, 1.0)