Nicholas Schense 2 месяцев назад
Родитель
Сommit
2e4c5b386d

+ 2 - 1
.gitignore

@@ -1,2 +1,3 @@
 .venv/
-__pycache__/
+__pycache__/
+analysis_output/

+ 109 - 0
analysis/README.md

@@ -0,0 +1,109 @@
+# Model Evaluation Code
+
+## Description
+
+This folder should contain all the necessary code for the evaluation of the Bayeisan model and the Deep Ensemble. It should generate and save graphs and statistics for this. The included analyses should include
+
+- Performance information (i.e. basic information on accuracy, number correct, number incorrect, F1 score, etc.)
+- Some basic metrics for uncertainty (ECE, MCE)
+- Physican Confidence Analysis (graphing uncertainty vs physican confidence)
+- Longitudinal Analysis (graphing uncertainty on patients who remained stable CN, stable AD, or switched from CN to AD)
+- Noise introduction analysis (graphing uncertainty on normal image and images with increasing levels of Gaussian noise applied)
+
+"Uncertainty" should be taken to mean EITHER confidence or standard deviation for the ensemble and the bayesian network (i.e. the raw outputs distance from 0.5 or either the stdev from the Bayesian or the stdev of all the model outputs). Analyses should be evaluated with both.
+
+The code in the senior_research_thesis and the prebious alnn_rewrite/analysis folders should be consulted. The models should be loaded from the currently in-place config.toml
+
+## Implementation Status
+
+The modular implementation now lives entirely in this folder.
+
+- All new source code is under `alnn_rewrite/analysis`
+- All generated artifacts are written under `alnn_rewrite/analysis_output`
+
+If a backend does not already have `model_evaluation_results.nc`, the pipeline now automatically evaluates that backend on the holdout datasets (validation + test) first, saves the generated netCDF into that backend's model output directory, and then runs the analyses.
+
+Uncertainty analyses are now run using both:
+
+- Confidence-based certainty: `2 * |p - 0.5|` where `0` means very uncertain and `1` means very certain
+- Confidence-based uncertainty: `1 - 2 * |p - 0.5|` so larger values mean more uncertainty, matching std-based plots
+- Secondary uncertainty metric: ensemble uses std across models; bayesian uses predictive entropy across MC samples (`bayesian_torch.utils.util.predictive_entropy`)
+
+### Current Modules
+
+- `evaluate_models.py`: Orchestrator CLI for running selected analyses across ensemble and bayesian backends.
+- `runtime.py`: Runtime paths and JSON helpers.
+- `data_access.py`: netCDF loading, class probability extraction, and clinical table access.
+- `metrics.py`: Shared performance and calibration metrics.
+- `analysis_modules.py`: Performance, calibration, physician confidence, and longitudinal analyses.
+- `noise_analysis.py`: Evaluation-time Gaussian noise sensitivity analysis.
+
+### How To Run
+
+From `alnn_rewrite`:
+
+```bash
+python -m analysis.evaluate_models
+```
+
+Useful options:
+
+```bash
+python -m analysis.evaluate_models \
+	--backend ensemble bayesian \
+	--run-name first_modular_run \
+	--positive-class-index 0 \
+	--calibration-bins 10 \
+	--noise-sigmas 0.0 0.01 0.03 0.05 0.1
+```
+
+If you want to skip noise analysis while validating the pipeline:
+
+```bash
+python -m analysis.evaluate_models --skip-noise
+```
+
+### Output Layout
+
+Each run creates a dedicated directory:
+
+```text
+alnn_rewrite/analysis_output/
+	run_YYYYMMDD_HHMMSS/
+		run_manifest.json
+		ensemble/
+			backend_summary.json
+			performance_threshold_sweep.csv
+			performance_threshold_sweep.png
+			calibration_bins.csv
+			calibration_reliability.png
+			physician_grouped_metrics.csv
+			physician_confidence_grouped_metrics.csv
+			physician_std_grouped_metrics.csv
+			physician_confidence_boxplot.png
+			physician_std_boxplot.png
+			longitudinal_patient_summary.csv
+			longitudinal_cohort_summary.csv
+			longitudinal_uncertainty_by_cohort.csv
+			longitudinal_confidence_patient_summary.csv
+			longitudinal_std_patient_summary.csv
+			longitudinal_confidence_cohort_summary.csv
+			longitudinal_std_cohort_summary.csv
+			longitudinal_cohort_confidence.png
+			longitudinal_cohort_std.png
+			noise_sensitivity.csv
+			noise_sensitivity.png
+			noise_uncertainty.png
+			noise_confidence_certainty.png
+			noise_examples/
+				ensemble_noise_examples.png
+				bayesian_noise_examples.png
+		bayesian/
+			... (same structure)
+```
+
+The default noise schedule now includes noisier settings beyond the earlier small-sigma cases, so the saved example images show a clearer progression from lightly noised to heavily noised volumes.
+
+For the noise analysis, the uncertainty plot uses the confidence metric in its uncertainty orientation, so higher values always mean more uncertainty. A separate certainty plot is also saved for direct inspection of `2 * |p - 0.5|`.
+
+Noise is now scaled by each sample's MRI intensity standard deviation, so sigma is dimensionless and interpretable across raw intensity ranges. In this setup, `sigma=1.0` means the injected noise standard deviation is approximately equal to the image's own standard deviation.

+ 1 - 0
analysis/__init__.py

@@ -0,0 +1 @@
+"""Modular analysis package for model evaluation."""

+ 521 - 0
analysis/analysis_modules.py

@@ -0,0 +1,521 @@
+# pyright: basic
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+from .data_access import BackendEvaluation, physician_column
+from .metrics import calibration_stats, performance_at_threshold, threshold_sweep
+from .runtime import write_json
+
+
+def _save_table(rows: list[dict[str, Any]], out_path: Path) -> pd.DataFrame:
+    df = pd.DataFrame(rows)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(out_path, index=False)
+    return df
+
+
+def run_performance(
+    evaluation: BackendEvaluation,
+    output_dir: Path,
+    thresholds: np.ndarray,
+) -> dict[str, Any]:
+    rows = threshold_sweep(evaluation.y_true, evaluation.y_prob, thresholds)
+    table_path = output_dir / "performance_threshold_sweep.csv"
+    df = _save_table(rows, table_path)
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.plot(df["threshold"], df["accuracy"], label="accuracy", marker="o")
+    ax.plot(df["threshold"], df["f1"], label="f1", marker="s")
+    ax.set_xlabel("Threshold")
+    ax.set_ylabel("Score")
+    ax.set_title(f"Performance vs Threshold ({evaluation.backend})")
+    ax.grid(True, alpha=0.3)
+    ax.legend()
+    fig.tight_layout()
+    plot_path = output_dir / "performance_threshold_sweep.png"
+    fig.savefig(plot_path)
+    plt.close(fig)
+
+    best_idx = int(df["f1"].idxmax())
+    best = df.iloc[best_idx].to_dict()
+
+    cutoff_percentiles = np.array(
+        [100, 95, 90, 85, 80, 75, 70, 60, 50, 40, 30, 20, 10, 5, 2, 1],
+        dtype=float,
+    )
+    confidence_uncertainty = 1.0 - np.asarray(
+        evaluation.uncertainty_confidence, dtype=float
+    )
+    secondary_uncertainty = np.asarray(evaluation.uncertainty_std, dtype=float)
+    uncertainty_types = [
+        ("confidence_uncertainty", confidence_uncertainty),
+        (evaluation.uncertainty_metric, secondary_uncertainty),
+    ]
+
+    cutoff_rows: list[dict[str, Any]] = []
+    for uncertainty_name, values in uncertainty_types:
+        finite_mask = np.isfinite(values)
+        if not finite_mask.any():
+            continue
+
+        values_valid = values[finite_mask]
+        y_true_valid = evaluation.y_true[finite_mask]
+        y_prob_valid = evaluation.y_prob[finite_mask]
+
+        for cutoff_percentile in cutoff_percentiles:
+            # Keep predictions whose uncertainty is <= percentile cutoff.
+            cutoff_value = float(np.percentile(values_valid, cutoff_percentile))
+            keep_mask = values_valid <= cutoff_value
+            retained = int(keep_mask.sum())
+            if retained == 0:
+                continue
+
+            perf = performance_at_threshold(
+                y_true=y_true_valid[keep_mask],
+                y_prob=y_prob_valid[keep_mask],
+                threshold=0.5,
+            )
+            cutoff_rows.append(
+                {
+                    "uncertainty_type": uncertainty_name,
+                    "cutoff_percentile": float(cutoff_percentile),
+                    "cutoff_value": cutoff_value,
+                    "n_retained": retained,
+                    "coverage": float(retained / len(values_valid)),
+                    "accuracy": float(perf["accuracy"]),
+                    "f1": float(perf["f1"]),
+                }
+            )
+
+    cutoff_table_path = output_dir / "performance_uncertainty_cutoff.csv"
+    cutoff_plot_path = output_dir / "performance_uncertainty_cutoff.png"
+    if cutoff_rows:
+        cutoff_df = pd.DataFrame(cutoff_rows)
+        cutoff_df.to_csv(cutoff_table_path, index=False)
+
+        fig_u, axes_u = plt.subplots(1, 2, figsize=(14, 5), sharex=True)
+        for uncertainty_name, group in cutoff_df.groupby("uncertainty_type"):
+            g = group.sort_values("cutoff_percentile", ascending=False)
+            axes_u[0].plot(
+                g["cutoff_percentile"],
+                g["accuracy"],
+                marker="o",
+                label=uncertainty_name,
+            )
+            axes_u[1].plot(
+                g["cutoff_percentile"],
+                g["f1"],
+                marker="s",
+                label=uncertainty_name,
+            )
+
+        axes_u[0].set_title("Accuracy vs Uncertainty Cutoff Percentile")
+        axes_u[1].set_title("F1 vs Uncertainty Cutoff Percentile")
+        for ax in axes_u:
+            ax.set_xlabel("Uncertainty Cutoff Percentile (100 = no cutoff)")
+            ax.grid(True, alpha=0.3)
+            ax.legend()
+        axes_u[0].set_ylabel("Accuracy")
+        axes_u[1].set_ylabel("F1")
+        fig_u.tight_layout()
+        fig_u.savefig(cutoff_plot_path)
+        plt.close(fig_u)
+    summary = {
+        "best_by_f1": {
+            k: float(v) for k, v in best.items() if isinstance(v, (int, float))
+        },
+        "table": str(table_path),
+        "plot": str(plot_path),
+        "uncertainty_cutoff": {
+            "table": str(cutoff_table_path),
+            "plot": str(cutoff_plot_path),
+            "decision_threshold": 0.5,
+        },
+    }
+    write_json(output_dir / "performance_summary.json", summary)
+    return summary
+
+
+def run_calibration(
+    evaluation: BackendEvaluation,
+    output_dir: Path,
+    bins: int,
+) -> dict[str, Any]:
+    summary, per_bin = calibration_stats(
+        evaluation.y_true, evaluation.y_prob, bins=bins
+    )
+
+    bin_df = pd.DataFrame(
+        per_bin,
+        columns=["mean_confidence", "fraction_positive", "count"],
+    )
+    table_path = output_dir / "calibration_bins.csv"
+    bin_df.to_csv(table_path, index=False)
+
+    fig, ax = plt.subplots(figsize=(6, 6))
+    valid = ~np.isnan(per_bin[:, 1])
+    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="ideal")
+    ax.plot(
+        per_bin[valid, 0],
+        per_bin[valid, 1],
+        marker="o",
+        label=f"{evaluation.backend}",
+    )
+    ax.set_xlabel("Mean Predicted Probability")
+    ax.set_ylabel("Empirical Fraction Positive")
+    ax.set_title(f"Reliability Diagram ({evaluation.backend})")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+    fig.tight_layout()
+    plot_path = output_dir / "calibration_reliability.png"
+    fig.savefig(plot_path)
+    plt.close(fig)
+
+    out = {
+        **summary,
+        "table": str(table_path),
+        "plot": str(plot_path),
+    }
+    write_json(output_dir / "calibration_summary.json", out)
+    return out
+
+
+def run_physician(
+    evaluation: BackendEvaluation,
+    clinical_df: pd.DataFrame,
+    output_dir: Path,
+) -> dict[str, Any]:
+    secondary_key = (
+        "predictive_entropy"
+        if evaluation.uncertainty_metric == "predictive_entropy"
+        else "std"
+    )
+    secondary_label = (
+        "Model Predictive Entropy"
+        if secondary_key == "predictive_entropy"
+        else "Model Uncertainty Std"
+    )
+
+    col = physician_column(clinical_df)
+    subset = clinical_df[["Image Data ID", col]].copy()
+    subset[col] = pd.to_numeric(subset[col], errors="coerce")
+    subset = subset.dropna(subset=["Image Data ID", col])
+    subset["Image Data ID"] = subset["Image Data ID"].astype(int)
+    subset[col] = subset[col].astype(int)
+
+    eval_df = pd.DataFrame(
+        {
+            "Image Data ID": evaluation.image_ids.astype(int),
+            "model_confidence": evaluation.uncertainty_confidence,
+            "model_std": evaluation.uncertainty_std,
+            "model_prob": evaluation.y_prob,
+        }
+    )
+    merged = eval_df.merge(subset, on="Image Data ID", how="inner")
+
+    if merged.empty:
+        raise ValueError("No overlapping Image Data ID rows for physician analysis")
+
+    grouped_rows: list[dict[str, Any]] = []
+    uncertainty_specs = [
+        ("confidence", "model_confidence", "Model Confidence (2*|p-0.5|)"),
+        (secondary_key, "model_std", secondary_label),
+    ]
+    ratings = [int(r) for r in sorted(pd.unique(merged[col]))]
+    plot_paths: dict[str, str] = {}
+    correlations: dict[str, float] = {}
+
+    for metric_name, metric_col, metric_label in uncertainty_specs:
+        grouped_metric = (
+            merged.groupby(col)
+            .agg(
+                n=("Image Data ID", "count"),
+                mean_value=(metric_col, "mean"),
+                std_value=(metric_col, "std"),
+                mean_prob=("model_prob", "mean"),
+            )
+            .reset_index()
+            .rename(columns={col: "physician_rating"})
+        )
+        grouped_metric["uncertainty_type"] = metric_name
+        grouped_rows.extend(
+            [
+                {str(k): v for k, v in rec.items()}
+                for rec in grouped_metric.to_dict(orient="records")
+            ]
+        )
+
+        fig, ax = plt.subplots(figsize=(9, 5))
+        data = [
+            np.asarray(merged.loc[merged[col] == r, metric_col], dtype=float)
+            for r in ratings
+        ]
+        ax.boxplot(data, tick_labels=[str(r) for r in ratings])
+        ax.set_xlabel("Physician Confidence Rating (DXCONFID)")
+        ax.set_ylabel(metric_label)
+        ax.set_title(f"{metric_label} vs Physician Confidence ({evaluation.backend})")
+        ax.grid(True, axis="y", alpha=0.3)
+        fig.tight_layout()
+        plot_path = output_dir / f"physician_{metric_name}_boxplot.png"
+        fig.savefig(plot_path)
+        plt.close(fig)
+
+        corr = float(
+            pd.to_numeric(
+                merged[[metric_col, col]].corr(method="spearman").iloc[0, 1],
+                errors="coerce",
+            )
+        )
+        correlations[metric_name] = corr
+        plot_paths[metric_name] = str(plot_path)
+
+    grouped = pd.DataFrame(grouped_rows)
+    table_path = output_dir / "physician_grouped_metrics.csv"
+    grouped.to_csv(table_path, index=False)
+
+    confidence_table = output_dir / "physician_confidence_grouped_metrics.csv"
+    std_table = output_dir / "physician_std_grouped_metrics.csv"
+    secondary_table = output_dir / f"physician_{secondary_key}_grouped_metrics.csv"
+    grouped[grouped["uncertainty_type"] == "confidence"].to_csv(
+        confidence_table, index=False
+    )
+    grouped[grouped["uncertainty_type"] == secondary_key].to_csv(
+        secondary_table, index=False
+    )
+    grouped[grouped["uncertainty_type"] == secondary_key].to_csv(std_table, index=False)
+
+    out = {
+        "n_overlap": int(len(merged)),
+        "spearman_vs_dxconfid": correlations,
+        "table": str(table_path),
+        "tables": {
+            "confidence": str(confidence_table),
+            secondary_key: str(secondary_table),
+            "std": str(std_table),
+        },
+        "plots": plot_paths,
+    }
+    write_json(output_dir / "physician_summary.json", out)
+    return out
+
+
+def _normalize_dx(value: Any) -> str:
+    if value is None or (isinstance(value, float) and np.isnan(value)):
+        return ""
+
+    v = str(value).strip().upper()
+    if v in {"NL", "NORMAL"}:
+        return "CN"
+    return v
+
+
+def run_longitudinal(
+    evaluation: BackendEvaluation,
+    clinical_df: pd.DataFrame,
+    output_dir: Path,
+) -> dict[str, Any]:
+    secondary_key = (
+        "predictive_entropy"
+        if evaluation.uncertainty_metric == "predictive_entropy"
+        else "std"
+    )
+    secondary_label = (
+        "Mean Model Predictive Entropy"
+        if secondary_key == "predictive_entropy"
+        else "Mean Model Uncertainty Std"
+    )
+
+    required = ["Image Data ID", "PTID"]
+    missing = [c for c in required if c not in clinical_df.columns]
+    if missing:
+        raise KeyError(f"Missing columns for longitudinal analysis: {missing}")
+
+    diagnosis_col = None
+    for candidate in ["Class", "DX", "Diagnosis"]:
+        if candidate in clinical_df.columns:
+            diagnosis_col = candidate
+            break
+    if diagnosis_col is None:
+        raise KeyError(
+            "No diagnosis column found. Expected one of: Class, DX, Diagnosis"
+        )
+
+    work = clinical_df[
+        ["Image Data ID", "PTID", diagnosis_col]
+        + [c for c in ["EXAMDATE"] if c in clinical_df.columns]
+    ].copy()
+    work["Image Data ID"] = pd.to_numeric(work["Image Data ID"], errors="coerce")
+    work = work.dropna(subset=["Image Data ID", "PTID"])
+    work["Image Data ID"] = work["Image Data ID"].astype(int)
+    work["PTID"] = work["PTID"].astype(str).str.strip()
+    work["diagnosis"] = work[diagnosis_col].map(_normalize_dx)
+
+    if "EXAMDATE" in work.columns:
+        work["EXAMDATE"] = pd.to_datetime(work["EXAMDATE"], errors="coerce")
+        work = work.sort_values(["PTID", "EXAMDATE"], na_position="last")
+    else:
+        work = work.sort_values(["PTID", "Image Data ID"])
+
+    eval_df = pd.DataFrame(
+        {
+            "Image Data ID": evaluation.image_ids.astype(int),
+            "model_confidence": evaluation.uncertainty_confidence,
+            "model_std": evaluation.uncertainty_std,
+            "model_prob": evaluation.y_prob,
+        }
+    )
+    merged = work.merge(eval_df, on="Image Data ID", how="inner")
+
+    if merged.empty:
+        raise ValueError("No overlapping Image Data ID rows for longitudinal analysis")
+
+    patient_rows: list[dict[str, Any]] = []
+    for ptid, group in merged.groupby("PTID"):
+        diagnoses = [d for d in group["diagnosis"].tolist() if d]
+        if len(diagnoses) < 2:
+            continue
+
+        first_dx = diagnoses[0]
+        last_dx = diagnoses[-1]
+        unique_dx = set(diagnoses)
+
+        cohort = "other"
+        if unique_dx.issubset({"CN"}):
+            cohort = "stable_cn"
+        elif unique_dx.issubset({"AD"}):
+            cohort = "stable_ad"
+        elif first_dx == "CN" and "AD" in unique_dx and last_dx == "AD":
+            cohort = "cn_to_ad"
+
+        patient_rows.append(
+            {
+                "PTID": ptid,
+                "n_visits": int(len(group)),
+                "first_dx": first_dx,
+                "last_dx": last_dx,
+                "cohort": cohort,
+                "mean_confidence": float(group["model_confidence"].mean()),
+                "mean_std": float(group["model_std"].mean()),
+                "mean_prob": float(group["model_prob"].mean()),
+            }
+        )
+
+    patient_df = pd.DataFrame(patient_rows)
+    table_path = output_dir / "longitudinal_patient_summary.csv"
+    patient_df.to_csv(table_path, index=False)
+
+    cohort_df = (
+        patient_df.groupby("cohort")
+        .agg(
+            n_patients=("PTID", "count"),
+            mean_confidence=("mean_confidence", "mean"),
+            mean_std=("mean_std", "mean"),
+            mean_prob=("mean_prob", "mean"),
+        )
+        .reset_index()
+    )
+    cohort_table = output_dir / "longitudinal_cohort_summary.csv"
+    cohort_df.to_csv(cohort_table, index=False)
+
+    cohorts = ["stable_cn", "stable_ad", "cn_to_ad"]
+    uncertainty_specs = [
+        ("confidence", "mean_confidence", "Mean Model Confidence"),
+        (secondary_key, "mean_std", secondary_label),
+    ]
+    plot_paths: dict[str, str] = {}
+    for metric_name, metric_col, metric_label in uncertainty_specs:
+        fig, ax = plt.subplots(figsize=(9, 5))
+        values = [
+            np.asarray(
+                patient_df.loc[patient_df["cohort"] == c, metric_col], dtype=float
+            )
+            for c in cohorts
+        ]
+        ax.boxplot(values, tick_labels=cohorts)
+        ax.set_ylabel(metric_label)
+        ax.set_title(f"Longitudinal Cohort {metric_label} ({evaluation.backend})")
+        ax.grid(True, axis="y", alpha=0.3)
+        fig.tight_layout()
+        plot_path = output_dir / f"longitudinal_cohort_{metric_name}.png"
+        fig.savefig(plot_path)
+        plt.close(fig)
+        plot_paths[metric_name] = str(plot_path)
+
+    uncertainty_by_cohort = cohort_df.melt(
+        id_vars=["cohort", "n_patients"],
+        value_vars=["mean_confidence", "mean_std"],
+        var_name="uncertainty_type",
+        value_name="mean_value",
+    ).replace(
+        {
+            "uncertainty_type": {
+                "mean_confidence": "confidence",
+                "mean_std": secondary_key,
+            }
+        }
+    )
+    uncertainty_table = output_dir / "longitudinal_uncertainty_by_cohort.csv"
+    uncertainty_by_cohort.to_csv(uncertainty_table, index=False)
+
+    confidence_patient_table = (
+        output_dir / "longitudinal_confidence_patient_summary.csv"
+    )
+    std_patient_table = output_dir / "longitudinal_std_patient_summary.csv"
+    confidence_cohort_table = output_dir / "longitudinal_confidence_cohort_summary.csv"
+    std_cohort_table = output_dir / "longitudinal_std_cohort_summary.csv"
+    secondary_patient_table = (
+        output_dir / f"longitudinal_{secondary_key}_patient_summary.csv"
+    )
+    secondary_cohort_table = (
+        output_dir / f"longitudinal_{secondary_key}_cohort_summary.csv"
+    )
+
+    patient_df[
+        ["PTID", "n_visits", "first_dx", "last_dx", "cohort", "mean_confidence"]
+    ].to_csv(confidence_patient_table, index=False)
+    patient_df[
+        ["PTID", "n_visits", "first_dx", "last_dx", "cohort", "mean_std"]
+    ].to_csv(std_patient_table, index=False)
+    patient_df[
+        ["PTID", "n_visits", "first_dx", "last_dx", "cohort", "mean_std"]
+    ].to_csv(secondary_patient_table, index=False)
+    cohort_df[["cohort", "n_patients", "mean_confidence"]].to_csv(
+        confidence_cohort_table, index=False
+    )
+    cohort_df[["cohort", "n_patients", "mean_std"]].to_csv(
+        std_cohort_table, index=False
+    )
+    cohort_df[["cohort", "n_patients", "mean_std"]].to_csv(
+        secondary_cohort_table, index=False
+    )
+
+    out = {
+        "n_patients_analyzed": int(len(patient_df)),
+        "table_patient": str(table_path),
+        "table_cohort": str(cohort_table),
+        "table_uncertainty": str(uncertainty_table),
+        "tables": {
+            "confidence": {
+                "patient": str(confidence_patient_table),
+                "cohort": str(confidence_cohort_table),
+            },
+            secondary_key: {
+                "patient": str(secondary_patient_table),
+                "cohort": str(secondary_cohort_table),
+            },
+            "std": {
+                "patient": str(std_patient_table),
+                "cohort": str(std_cohort_table),
+            },
+        },
+        "plots": plot_paths,
+    }
+    write_json(output_dir / "longitudinal_summary.json", out)
+    return out

+ 170 - 0
analysis/data_access.py

@@ -0,0 +1,170 @@
+# pyright: basic
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+from bayesian_torch.utils.util import predictive_entropy
+
+
+@dataclass
+class BackendEvaluation:
+    backend: str
+    source_file: Path
+    image_ids: np.ndarray
+    y_true: np.ndarray
+    y_prob: np.ndarray
+    uncertainty_confidence: np.ndarray
+    uncertainty_std: np.ndarray
+    uncertainty_metric: str
+
+
+def _resolve_dataset_path(model_output_dir: Path) -> Path:
+    primary = model_output_dir / "model_evaluation_results.nc"
+    if primary.exists():
+        return primary
+
+    candidates = sorted(model_output_dir.glob("*.nc"))
+    if not candidates:
+        raise FileNotFoundError(f"No netCDF file found under {model_output_dir}")
+
+    return candidates[0]
+
+
+def _positive_probability(
+    predictions: xr.DataArray,
+    class_index: int,
+) -> tuple[np.ndarray, np.ndarray, str]:
+    if "img_class" not in predictions.dims:
+        raise ValueError("predictions is missing required dim: img_class")
+
+    if class_index >= predictions.sizes["img_class"]:
+        raise ValueError(
+            f"positive class index {class_index} is out of bounds for img_class size {predictions.sizes['img_class']}"
+        )
+
+    if "model" in predictions.dims:
+        prob_mean = predictions.mean(dim="model").isel(img_class=class_index).values
+        prob_std = predictions.std(dim="model").isel(img_class=class_index).values
+        return (
+            np.asarray(prob_mean, dtype=float),
+            np.asarray(prob_std, dtype=float),
+            "std",
+        )
+
+    sample_like = [d for d in predictions.dims if d in {"sample", "mc_sample", "draw"}]
+    if sample_like:
+        dim = str(sample_like[0])
+        prob_mean = predictions.mean(dim=dim).isel(img_class=class_index).values
+
+        # For Bayesian MC predictions, uncertainty should come from predictive
+        # entropy of the predictive distribution rather than classwise std.
+        mc_preds = predictions.transpose(dim, "img_id", "img_class").values
+        entropy_uncertainty = predictive_entropy(np.asarray(mc_preds, dtype=float))
+        return (
+            np.asarray(prob_mean, dtype=float),
+            np.asarray(entropy_uncertainty, dtype=float),
+            "predictive_entropy",
+        )
+
+    prob = predictions.isel(img_class=class_index).values
+    return (
+        np.asarray(prob, dtype=float),
+        np.full_like(np.asarray(prob, dtype=float), np.nan),
+        "unknown",
+    )
+
+
+def _labels_to_binary(labels: xr.DataArray, class_index: int) -> np.ndarray:
+    if "label" in labels.dims:
+        if class_index >= labels.sizes["label"]:
+            raise ValueError(
+                f"positive class index {class_index} is out of bounds for label size {labels.sizes['label']}"
+            )
+
+        # One-hot labels expected in this repository.
+        binary = labels.argmax(dim="label").values == class_index
+        return np.asarray(binary, dtype=int)
+
+    # Fallback if labels are already binary.
+    return np.asarray(labels.values, dtype=int)
+
+
+def load_backend_evaluation(
+    config: dict[str, Any],
+    backend: str,
+    class_index: int,
+) -> BackendEvaluation:
+    output_key = f"{backend}_path"
+    if output_key not in config["output"]:
+        raise KeyError(f"Missing output path key in config: output.{output_key}")
+
+    model_output_dir = Path(config["output"][output_key]).expanduser().resolve()
+    ds_path = _resolve_dataset_path(model_output_dir)
+    ds = xr.open_dataset(ds_path)
+
+    if "predictions" not in ds or "labels" not in ds:
+        raise ValueError(
+            f"Dataset {ds_path} must contain predictions and labels variables"
+        )
+
+    predictions = ds["predictions"]
+    labels = ds["labels"]
+
+    if "img_id" in predictions.coords:
+        image_ids = np.asarray(predictions.coords["img_id"].values)
+    elif "img_id" in labels.coords:
+        image_ids = np.asarray(labels.coords["img_id"].values)
+    else:
+        length = predictions.sizes.get("img_id", labels.sizes.get("img_id"))
+        if length is None:
+            raise ValueError("Could not infer img_id length from predictions/labels")
+        image_ids = np.arange(length)
+
+    y_true = _labels_to_binary(labels, class_index=class_index)
+    y_prob, y_std, uncertainty_metric = _positive_probability(
+        predictions, class_index=class_index
+    )
+    conf = 2.0 * np.abs(y_prob - 0.5)
+
+    if len(y_true) != len(y_prob):
+        raise ValueError(
+            f"Length mismatch after loading backend {backend}: labels={len(y_true)}, probs={len(y_prob)}"
+        )
+
+    return BackendEvaluation(
+        backend=backend,
+        source_file=ds_path,
+        image_ids=image_ids,
+        y_true=y_true,
+        y_prob=y_prob,
+        uncertainty_confidence=conf,
+        uncertainty_std=y_std,
+        uncertainty_metric=uncertainty_metric,
+    )
+
+
+def load_clinical_table(config: dict[str, Any], root_dir: Path) -> pd.DataFrame:
+    csv_path = (root_dir / config["data"]["xls_file_path"]).resolve()
+    df = pd.read_csv(csv_path)
+    df.columns = df.columns.str.strip()
+    return df
+
+
+def physician_column(df: pd.DataFrame) -> str:
+    exact = "DXCONFID"
+    if exact in df.columns:
+        return exact
+
+    for col in df.columns:
+        if "dxconfid" in col.lower():
+            return col
+
+    raise KeyError(
+        "No physician confidence column with DXCONFID found in clinical table"
+    )

+ 242 - 0
analysis/evaluate_models.py

@@ -0,0 +1,242 @@
+# pyright: basic
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+
+from analysis.analysis_modules import (
+    run_calibration,
+    run_longitudinal,
+    run_performance,
+    run_physician,
+)
+from analysis.data_access import load_backend_evaluation, load_clinical_table
+from analysis.holdout_evaluation import ensure_backend_netcdf
+from analysis.noise_analysis import run_noise_analysis
+from analysis.runtime import backend_dir, init_runtime_paths, load_config, write_json
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Run modular evaluation analyses for ensemble and bayesian models. "
+            "All outputs are written to alnn_rewrite/analysis_output."
+        )
+    )
+    parser.add_argument(
+        "--backend",
+        nargs="+",
+        choices=["ensemble", "bayesian"],
+        default=["ensemble", "bayesian"],
+        help="Backends to evaluate.",
+    )
+    parser.add_argument(
+        "--run-name",
+        default=None,
+        help="Optional run directory name under analysis_output.",
+    )
+    parser.add_argument(
+        "--threshold-start",
+        type=float,
+        default=0.5,
+        help="Threshold sweep start.",
+    )
+    parser.add_argument(
+        "--threshold-stop",
+        type=float,
+        default=0.95,
+        help="Threshold sweep stop (inclusive).",
+    )
+    parser.add_argument(
+        "--threshold-step",
+        type=float,
+        default=0.05,
+        help="Threshold sweep step.",
+    )
+    parser.add_argument(
+        "--decision-threshold",
+        type=float,
+        default=0.5,
+        help="Decision threshold used in noise analysis.",
+    )
+    parser.add_argument(
+        "--positive-class-index",
+        type=int,
+        default=0,
+        help="Positive class index in both predictions.img_class and labels.label.",
+    )
+    parser.add_argument(
+        "--calibration-bins",
+        type=int,
+        default=10,
+        help="Number of reliability bins used for ECE/MCE.",
+    )
+    parser.add_argument(
+        "--skip-noise",
+        action="store_true",
+        help="Skip Gaussian noise sensitivity analysis.",
+    )
+    parser.add_argument(
+        "--noise-sigmas",
+        nargs="+",
+        type=float,
+        default=[
+            0.0,
+            0.01,
+            0.03,
+            0.05,
+            0.1,
+            0.2,
+            0.3,
+            0.4,
+            0.5,
+            0.6,
+            0.75,
+            1.0,
+        ],
+        help="Gaussian noise sigmas for sensitivity analysis.",
+    )
+    parser.add_argument(
+        "--bayesian-mc-passes",
+        type=int,
+        default=20,
+        help="MC forward passes for bayesian noise analysis.",
+    )
+    return parser.parse_args()
+
+
+def _threshold_array(start: float, stop: float, step: float) -> np.ndarray:
+    if step <= 0:
+        raise ValueError("threshold-step must be > 0")
+    if stop < start:
+        raise ValueError("threshold-stop must be >= threshold-start")
+
+    # Include stop when it lands on a step boundary.
+    n = int(round((stop - start) / step))
+    return np.array([start + i * step for i in range(n + 1)], dtype=float)
+
+
+def _run_backend(
+    config: dict[str, Any],
+    root_dir: Path,
+    backend: str,
+    clinical_df: pd.DataFrame,
+    args: argparse.Namespace,
+    out_dir: Path,
+) -> dict[str, Any]:
+    netcdf_path = ensure_backend_netcdf(
+        config=config,
+        root_dir=root_dir,
+        backend=backend,
+        bayesian_mc_passes=int(args.bayesian_mc_passes),
+    )
+
+    evaluation = load_backend_evaluation(
+        config=config,
+        backend=backend,
+        class_index=int(args.positive_class_index),
+    )
+
+    thresholds = _threshold_array(
+        start=float(args.threshold_start),
+        stop=float(args.threshold_stop),
+        step=float(args.threshold_step),
+    )
+
+    summary: dict[str, Any] = {
+        "backend": backend,
+        "netcdf": str(netcdf_path),
+        "source_file": str(evaluation.source_file),
+        "uncertainty_metric": evaluation.uncertainty_metric,
+    }
+
+    summary["performance"] = run_performance(
+        evaluation=evaluation,
+        output_dir=out_dir,
+        thresholds=thresholds,
+    )
+    summary["calibration"] = run_calibration(
+        evaluation=evaluation,
+        output_dir=out_dir,
+        bins=int(args.calibration_bins),
+    )
+    summary["physician"] = run_physician(
+        evaluation=evaluation,
+        clinical_df=clinical_df,
+        output_dir=out_dir,
+    )
+    summary["longitudinal"] = run_longitudinal(
+        evaluation=evaluation,
+        clinical_df=clinical_df,
+        output_dir=out_dir,
+    )
+
+    if args.skip_noise:
+        summary["noise"] = {"skipped": True, "reason": "--skip-noise supplied"}
+    else:
+        try:
+            summary["noise"] = run_noise_analysis(
+                config=config,
+                root_dir=Path(__file__).resolve().parents[1],
+                backend=backend,
+                output_dir=out_dir,
+                class_index=int(args.positive_class_index),
+                noise_sigmas=[float(x) for x in args.noise_sigmas],
+                threshold=float(args.decision_threshold),
+                calibration_bins=int(args.calibration_bins),
+                bayesian_mc_passes=int(args.bayesian_mc_passes),
+            )
+        except Exception as exc:
+            summary["noise"] = {
+                "skipped": True,
+                "reason": f"Noise analysis failed: {exc}",
+            }
+
+    write_json(out_dir / "backend_summary.json", summary)
+    return summary
+
+
+def main() -> None:
+    args = _parse_args()
+
+    analysis_dir = Path(__file__).resolve().parent
+    paths = init_runtime_paths(analysis_dir=analysis_dir, run_name=args.run_name)
+    config = load_config(paths.root_dir)
+    clinical_df = load_clinical_table(config=config, root_dir=paths.root_dir)
+
+    manifest: dict[str, Any] = {
+        "run_dir": str(paths.run_dir),
+        "output_root": str(paths.output_root),
+        "positive_class_index": int(args.positive_class_index),
+        "threshold_sweep": {
+            "start": float(args.threshold_start),
+            "stop": float(args.threshold_stop),
+            "step": float(args.threshold_step),
+        },
+        "calibration_bins": int(args.calibration_bins),
+        "noise_sigmas": [float(x) for x in args.noise_sigmas],
+        "backends": {},
+    }
+
+    for backend in args.backend:
+        out_dir = backend_dir(paths, backend)
+        manifest["backends"][backend] = _run_backend(
+            config=config,
+            root_dir=paths.root_dir,
+            backend=backend,
+            clinical_df=clinical_df,
+            args=args,
+            out_dir=out_dir,
+        )
+
+    write_json(paths.run_dir / "run_manifest.json", manifest)
+    print(f"Analysis complete. Results saved to {paths.run_dir}")
+
+
+if __name__ == "__main__":
+    main()

+ 266 - 0
analysis/holdout_evaluation.py

@@ -0,0 +1,266 @@
+# pyright: basic
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import ConcatDataset, DataLoader
+import xarray as xr
+
+from data.dataset import (
+    divide_dataset_by_patient_id,
+    initalize_dataloaders,
+    load_adni_data_from_file,
+)
+from model.cnn import CNN3D
+
+
+def _enable_bayesian_sampling_mode(model: torch.nn.Module) -> None:
+    # Keep stochastic Bayesian layers in training mode, but freeze BatchNorm
+    # statistics to support batch_size=1 inference on holdout samples.
+    model.train()
+    for module in model.modules():
+        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
+            module.eval()
+
+
+def _xls_pre(df: pd.DataFrame) -> pd.DataFrame:
+    data = df[["Image Data ID", "Sex", "Age (current)"]].copy()
+    data["Sex"] = data["Sex"].astype(str).str.strip()
+    data = data.replace({"M": 0, "F": 1})
+    return data
+
+
+def _training_seed(config: dict[str, Any], backend_dir: Path) -> int:
+    config_json = backend_dir / "config.json"
+    if config_json.exists():
+        try:
+            with config_json.open("r", encoding="utf-8") as f:
+                trained = json.load(f)
+            return int(trained["data"]["seed"])
+        except Exception:
+            pass
+    return int(config["data"]["seed"])
+
+
+def _build_holdout_loader(
+    config: dict[str, Any], root_dir: Path, backend_dir: Path
+) -> DataLoader:
+    mri_files = (root_dir / config["data"]["mri_files_path"]).resolve().glob("*.nii")
+    xls_file = (root_dir / config["data"]["xls_file_path"]).resolve()
+
+    dataset = load_adni_data_from_file(
+        mri_files,
+        xls_file,
+        device=config["training"]["device"],
+        xls_preprocessor=_xls_pre,
+    )
+
+    ptid_df = pd.read_csv(xls_file)
+    ptid_df.columns = ptid_df.columns.str.strip()
+    ptid_df = ptid_df[["Image Data ID", "PTID"]].dropna(
+        subset=["Image Data ID", "PTID"]
+    )
+    ptid_df["Image Data ID"] = ptid_df["Image Data ID"].astype(int)
+    ptid_df["PTID"] = ptid_df["PTID"].astype(str).str.strip()
+    ptid_df = ptid_df[ptid_df["PTID"] != ""]
+
+    ptids = list(zip(ptid_df["Image Data ID"].tolist(), ptid_df["PTID"].tolist()))
+    datasets = divide_dataset_by_patient_id(
+        dataset,
+        ptids,
+        tuple(config["data"]["data_splits"]),
+        seed=_training_seed(config, backend_dir),
+    )
+
+    _, val_loader, test_loader = initalize_dataloaders(datasets, batch_size=1)
+    combined = ConcatDataset([val_loader.dataset, test_loader.dataset])
+    return DataLoader(combined, batch_size=1, shuffle=False)
+
+
+def _init_cnn(config: dict[str, Any]) -> torch.nn.Module:
+    return (
+        CNN3D(
+            image_channels=int(config["data"]["image_channels"]),
+            clin_data_channels=int(config["data"]["clin_data_channels"]),
+            num_classes=int(config["data"]["num_classes"]),
+            droprate=float(config["training"]["droprate"]),
+        )
+        .float()
+        .to(config["training"]["device"])
+    )
+
+
+def _extract_img_id(img_id_tensor: Any) -> int:
+    if hasattr(img_id_tensor, "detach"):
+        return int(img_id_tensor.detach().cpu().item())
+    return int(img_id_tensor)
+
+
+def _evaluate_ensemble(
+    config: dict[str, Any],
+    backend_dir: Path,
+    holdout_loader: DataLoader,
+) -> xr.Dataset:
+    device = str(config["training"]["device"])
+    model_files = sorted(backend_dir.glob("model_run_*.pt"))
+    if not model_files:
+        raise FileNotFoundError(f"No ensemble checkpoints found in {backend_dir}")
+
+    n_models = len(model_files)
+    n_samples = len(holdout_loader)
+    n_classes = int(config["data"]["num_classes"])
+
+    predictions = np.zeros((n_models, n_samples, n_classes), dtype=np.float32)
+    labels = np.zeros((n_samples, n_classes), dtype=np.float32)
+    image_ids = np.zeros((n_samples,), dtype=int)
+
+    for model_i, model_file in enumerate(model_files):
+        model = _init_cnn(config)
+        model.load_state_dict(
+            torch.load(model_file, map_location=device),
+            strict=False,
+        )
+        model.to(device)
+        model.eval()
+
+        with torch.no_grad():
+            for sample_i, (mri, xls, label, img_id) in enumerate(holdout_loader):
+                mri_device = mri.float().to(device)
+                xls_device = xls.float().to(device)
+                output = model((mri_device, xls_device))
+                predictions[model_i, sample_i, :] = output.detach().cpu().numpy()[0, :]
+
+                if model_i == 0:
+                    labels[sample_i, :] = label.detach().cpu().numpy()[0, :]
+                    image_ids[sample_i] = _extract_img_id(img_id)
+
+    model_coord = [int(f.stem.split("_")[2]) for f in model_files]
+    return xr.Dataset(
+        {
+            "predictions": xr.DataArray(
+                predictions,
+                dims=["model", "img_id", "img_class"],
+                coords={
+                    "model": model_coord,
+                    "img_id": image_ids,
+                    "img_class": list(range(n_classes)),
+                },
+            ),
+            "labels": xr.DataArray(
+                labels,
+                dims=["img_id", "label"],
+                coords={
+                    "img_id": image_ids,
+                    "label": list(range(n_classes)),
+                },
+            ),
+        }
+    )
+
+
+def _evaluate_bayesian(
+    config: dict[str, Any],
+    backend_dir: Path,
+    holdout_loader: DataLoader,
+    mc_passes: int,
+) -> xr.Dataset:
+    device = str(config["training"]["device"])
+    try:
+        from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn  # type: ignore[import-untyped]
+    except ImportError as e:
+        raise ImportError(
+            "bayesian_torch is required to evaluate bayesian checkpoints"
+        ) from e
+
+    ckpt = backend_dir / "model_bayesian.pt"
+    if not ckpt.exists():
+        raise FileNotFoundError(f"Bayesian checkpoint not found: {ckpt}")
+
+    model = _init_cnn(config)
+    prior_params: dict[str, float | bool | str] = {
+        "prior_mu": 0.0,
+        "prior_sigma": 1.0,
+        "posterior_mu_init": 0.0,
+        "posterior_rho_init": -3.0,
+        "type": "Reparameterization",
+        "moped_enable": False,
+        "moped_delta": 0.5,
+    }
+    dnn_to_bnn(model, prior_params)
+    model.to(device)
+    model.load_state_dict(torch.load(ckpt, map_location=device), strict=False)
+    model.to(device)
+    _enable_bayesian_sampling_mode(model)
+
+    n_samples = len(holdout_loader)
+    n_classes = int(config["data"]["num_classes"])
+
+    predictions = np.zeros((mc_passes, n_samples, n_classes), dtype=np.float32)
+    labels = np.zeros((n_samples, n_classes), dtype=np.float32)
+    image_ids = np.zeros((n_samples,), dtype=int)
+
+    with torch.no_grad():
+        for pass_i in range(mc_passes):
+            for sample_i, (mri, xls, label, img_id) in enumerate(holdout_loader):
+                mri_device = mri.float().to(device)
+                xls_device = xls.float().to(device)
+                output = model((mri_device, xls_device))
+                predictions[pass_i, sample_i, :] = output.detach().cpu().numpy()[0, :]
+
+                if pass_i == 0:
+                    labels[sample_i, :] = label.detach().cpu().numpy()[0, :]
+                    image_ids[sample_i] = _extract_img_id(img_id)
+
+    return xr.Dataset(
+        {
+            "predictions": xr.DataArray(
+                predictions,
+                dims=["sample", "img_id", "img_class"],
+                coords={
+                    "sample": list(range(mc_passes)),
+                    "img_id": image_ids,
+                    "img_class": list(range(n_classes)),
+                },
+            ),
+            "labels": xr.DataArray(
+                labels,
+                dims=["img_id", "label"],
+                coords={
+                    "img_id": image_ids,
+                    "label": list(range(n_classes)),
+                },
+            ),
+        }
+    )
+
+
+def ensure_backend_netcdf(
+    config: dict[str, Any],
+    root_dir: Path,
+    backend: str,
+    bayesian_mc_passes: int,
+) -> Path:
+    backend_dir = Path(config["output"][f"{backend}_path"]).expanduser().resolve()
+    backend_dir.mkdir(parents=True, exist_ok=True)
+
+    output_path = backend_dir / "model_evaluation_results.nc"
+    if output_path.exists():
+        return output_path
+
+    holdout_loader = _build_holdout_loader(config, root_dir, backend_dir)
+
+    if backend == "ensemble":
+        ds = _evaluate_ensemble(config, backend_dir, holdout_loader)
+    elif backend == "bayesian":
+        ds = _evaluate_bayesian(config, backend_dir, holdout_loader, bayesian_mc_passes)
+    else:
+        raise ValueError(f"Unsupported backend: {backend}")
+
+    ds.to_netcdf(output_path, mode="w")
+    return output_path

+ 109 - 0
analysis/metrics.py

@@ -0,0 +1,109 @@
+# pyright: basic
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def binary_confusion(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> dict[str, int]:
+    y_true_int = y_true.astype(int)
+    y_pred = (y_prob >= threshold).astype(int)
+
+    tp = int(np.logical_and(y_pred == 1, y_true_int == 1).sum())
+    fp = int(np.logical_and(y_pred == 1, y_true_int == 0).sum())
+    tn = int(np.logical_and(y_pred == 0, y_true_int == 0).sum())
+    fn = int(np.logical_and(y_pred == 0, y_true_int == 1).sum())
+
+    return {"tp": tp, "fp": fp, "tn": tn, "fn": fn}
+
+
+def _safe_div(num: float, den: float) -> float:
+    if den == 0:
+        return 0.0
+    return num / den
+
+
+def performance_at_threshold(
+    y_true: np.ndarray,
+    y_prob: np.ndarray,
+    threshold: float,
+) -> dict[str, float]:
+    c = binary_confusion(y_true, y_prob, threshold)
+    tp = c["tp"]
+    fp = c["fp"]
+    tn = c["tn"]
+    fn = c["fn"]
+
+    total = tp + fp + tn + fn
+    accuracy = _safe_div(tp + tn, total)
+    precision = _safe_div(tp, tp + fp)
+    recall = _safe_div(tp, tp + fn)
+    f1 = _safe_div(2 * precision * recall, precision + recall)
+
+    return {
+        "threshold": float(threshold),
+        "accuracy": float(accuracy),
+        "precision": float(precision),
+        "recall": float(recall),
+        "f1": float(f1),
+        "tp": float(tp),
+        "fp": float(fp),
+        "tn": float(tn),
+        "fn": float(fn),
+    }
+
+
+def threshold_sweep(
+    y_true: np.ndarray,
+    y_prob: np.ndarray,
+    thresholds: np.ndarray,
+) -> list[dict[str, float]]:
+    return [performance_at_threshold(y_true, y_prob, float(t)) for t in thresholds]
+
+
+def calibration_stats(
+    y_true: np.ndarray,
+    y_prob: np.ndarray,
+    bins: int = 10,
+) -> tuple[dict[str, float], np.ndarray]:
+    y_true_int = y_true.astype(int)
+    y_prob_f = y_prob.astype(float)
+
+    edges = np.linspace(0.0, 1.0, bins + 1)
+    bin_data: list[tuple[float, float, int]] = []
+    ece = 0.0
+    mce = 0.0
+
+    n = len(y_prob_f)
+    for i in range(bins):
+        lo = edges[i]
+        hi = edges[i + 1]
+        if i == bins - 1:
+            mask = (y_prob_f >= lo) & (y_prob_f <= hi)
+        else:
+            mask = (y_prob_f >= lo) & (y_prob_f < hi)
+
+        count = int(mask.sum())
+        if count == 0:
+            bin_data.append((float((lo + hi) / 2.0), np.nan, 0))
+            continue
+
+        mean_conf = float(y_prob_f[mask].mean())
+        frac_pos = float(y_true_int[mask].mean())
+        gap = abs(frac_pos - mean_conf)
+        ece += (count / n) * gap
+        mce = max(mce, gap)
+
+        bin_data.append((mean_conf, frac_pos, count))
+
+    brier = float(np.mean((y_prob_f - y_true_int) ** 2))
+
+    summary = {
+        "ece": float(ece),
+        "mce": float(mce),
+        "brier": brier,
+        "bins": float(bins),
+    }
+
+    arr = np.array(bin_data, dtype=float)
+    return summary, arr

+ 461 - 0
analysis/noise_analysis.py

@@ -0,0 +1,461 @@
+# pyright: basic
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import torch
+from bayesian_torch.utils.util import predictive_entropy
+
+from data.dataset import (
+    divide_dataset_by_patient_id,
+    initalize_dataloaders,
+    load_adni_data_from_file,
+)
+from model.cnn import CNN3D
+
+from .metrics import calibration_stats, performance_at_threshold
+from .runtime import write_json
+
+
+def _enable_bayesian_sampling_mode(model: torch.nn.Module) -> None:
+    model.eval()
+
+
+def _central_slice(volume: torch.Tensor) -> np.ndarray:
+    tensor = volume.detach().cpu()
+    if tensor.ndim == 5:
+        tensor = tensor[0]
+    if tensor.ndim == 4:
+        tensor = tensor[0]
+    if tensor.ndim != 3:
+        raise ValueError(
+            f"Expected a 3D volume after squeezing batch/channel, got shape {tuple(tensor.shape)}"
+        )
+
+    center_index = tensor.shape[0] // 2
+    return tensor[center_index].numpy().astype(float)
+
+
+def _normalize_for_display(image: np.ndarray) -> np.ndarray:
+    low = float(np.percentile(image, 1))
+    high = float(np.percentile(image, 99))
+    if high <= low:
+        return np.zeros_like(image, dtype=float)
+    clipped = np.clip(image, low, high)
+    return (clipped - low) / (high - low)
+
+
+def _save_noise_example_grid(
+    original_mri: torch.Tensor,
+    noisy_by_sigma: list[tuple[float, torch.Tensor]],
+    output_path: Path,
+    title: str,
+) -> None:
+    if not noisy_by_sigma:
+        return
+
+    original_slice = _normalize_for_display(_central_slice(original_mri))
+    n_rows = len(noisy_by_sigma)
+    fig, axes = plt.subplots(n_rows, 2, figsize=(8, 3.2 * n_rows))
+    if n_rows == 1:
+        axes = np.array([axes])
+
+    for row_idx, (sigma, noisy_tensor) in enumerate(noisy_by_sigma):
+        noisy_slice = _normalize_for_display(_central_slice(noisy_tensor))
+        ax_orig, ax_noisy = axes[row_idx]
+        ax_orig.imshow(original_slice, cmap="gray")
+        ax_orig.set_title(f"Original")
+        ax_orig.axis("off")
+
+        ax_noisy.imshow(noisy_slice, cmap="gray")
+        ax_noisy.set_title(f"Noisy sigma={sigma:g}")
+        ax_noisy.axis("off")
+
+    fig.suptitle(title)
+    fig.tight_layout()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    fig.savefig(output_path)
+    plt.close(fig)
+
+
+def _confidence_certainty(y_prob: np.ndarray) -> np.ndarray:
+    # 2 * |p - 0.5| maps 0 -> very uncertain and 1 -> very certain.
+    return 2.0 * np.abs(y_prob - 0.5)
+
+
+def _confidence_uncertainty(y_prob: np.ndarray) -> np.ndarray:
+    # Invert certainty so that larger values mean more uncertainty, matching std.
+    return 1.0 - _confidence_certainty(y_prob)
+
+
+def _apply_scaled_noise(volume: torch.Tensor, sigma: float) -> torch.Tensor:
+    # Make sigma dimensionless with respect to MRI intensity scale.
+    # sigma=1.0 means noise std equals total MRI intensity range, which is 65535 .
+    return volume + (torch.randn_like(volume) * sigma * 65535)
+
+
+def _xls_pre(df: pd.DataFrame) -> pd.DataFrame:
+    data = df[["Image Data ID", "Sex", "Age (current)"]].copy()
+    data["Sex"] = data["Sex"].astype(str).str.strip()
+    data = data.replace({"M": 0, "F": 1})
+    return data
+
+
+def _build_test_loader(
+    config: dict[str, Any], root_dir: Path
+) -> torch.utils.data.DataLoader:
+    mri_files = (root_dir / config["data"]["mri_files_path"]).resolve().glob("*.nii")
+    xls_file = (root_dir / config["data"]["xls_file_path"]).resolve()
+
+    dataset = load_adni_data_from_file(
+        mri_files,
+        xls_file,
+        device=config["training"]["device"],
+        xls_preprocessor=_xls_pre,
+    )
+
+    ptid_df = pd.read_csv(xls_file)
+    ptid_df.columns = ptid_df.columns.str.strip()
+    ptid_df = ptid_df[["Image Data ID", "PTID"]].dropna(
+        subset=["Image Data ID", "PTID"]
+    )
+    ptid_df["Image Data ID"] = ptid_df["Image Data ID"].astype(int)
+    ptid_df["PTID"] = ptid_df["PTID"].astype(str).str.strip()
+    ptid_df = ptid_df[ptid_df["PTID"] != ""]
+    ptids = list(zip(ptid_df["Image Data ID"].tolist(), ptid_df["PTID"].tolist()))
+
+    splits = divide_dataset_by_patient_id(
+        dataset,
+        ptids,
+        tuple(config["data"]["data_splits"]),
+        seed=int(config["data"]["seed"]),
+    )
+
+    _, _, test_loader = initalize_dataloaders(
+        splits,
+        batch_size=int(config["training"]["batch_size"]),
+    )
+    return test_loader
+
+
+def _load_ensemble_models(config: dict[str, Any]) -> list[torch.nn.Module]:
+    model_dir = Path(config["output"]["ensemble_path"])
+    model_files = sorted(model_dir.glob("model_run_*.pt"))
+    if not model_files:
+        raise FileNotFoundError(f"No ensemble model files found in {model_dir}")
+
+    models: list[torch.nn.Module] = []
+    for model_file in model_files:
+        model = (
+            CNN3D(
+                image_channels=int(config["data"]["image_channels"]),
+                clin_data_channels=int(config["data"]["clin_data_channels"]),
+                num_classes=int(config["data"]["num_classes"]),
+                droprate=float(config["training"]["droprate"]),
+            )
+            .float()
+            .to(config["training"]["device"])
+        )
+        model.load_state_dict(
+            torch.load(model_file, map_location=config["training"]["device"]),
+            strict=False,
+        )
+        model.eval()
+        models.append(model)
+
+    return models
+
+
+def _load_bayesian_model(config: dict[str, Any]) -> torch.nn.Module:
+    device = str(config["training"]["device"])
+    try:
+        from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn  # type: ignore[import-untyped]
+    except ImportError as e:
+        raise ImportError(
+            "bayesian_torch is required for bayesian noise analysis"
+        ) from e
+
+    model_path = Path(config["output"]["bayesian_path"]) / "model_bayesian.pt"
+    if not model_path.exists():
+        raise FileNotFoundError(f"Bayesian model checkpoint not found: {model_path}")
+
+    model = (
+        CNN3D(
+            image_channels=int(config["data"]["image_channels"]),
+            clin_data_channels=int(config["data"]["clin_data_channels"]),
+            num_classes=int(config["data"]["num_classes"]),
+            droprate=float(config["training"]["droprate"]),
+        )
+        .float()
+        .to(config["training"]["device"])
+    )
+
+    prior_params: dict[str, float | bool | str] = {
+        "prior_mu": 0.0,
+        "prior_sigma": 1.0,
+        "posterior_mu_init": 0.0,
+        "posterior_rho_init": -3.0,
+        "type": "Reparameterization",
+        "moped_enable": False,
+        "moped_delta": 0.5,
+    }
+    dnn_to_bnn(model, prior_params)
+    model.to(device)
+    model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
+    model.to(device)
+    _enable_bayesian_sampling_mode(model)
+    return model
+
+
+def _infer_with_noise_ensemble(
+    test_loader: torch.utils.data.DataLoader,
+    models: list[torch.nn.Module],
+    sigma: float,
+    class_index: int,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    if not models:
+        raise ValueError("No ensemble models were provided for noise inference")
+
+    device = next(models[0].parameters()).device
+    all_probs: list[float] = []
+    all_stds: list[float] = []
+    all_true: list[int] = []
+
+    with torch.no_grad():
+        for mri, xls, labels, _ in test_loader:
+            mri_device = mri.float().to(device)
+            xls_device = xls.float().to(device)
+            labels_device = labels.to(device)
+            noisy = _apply_scaled_noise(mri_device, sigma)
+            preds = []
+            for model in models:
+                out = model((noisy, xls_device))
+                preds.append(out[:, class_index].detach().cpu().numpy())
+
+            pred_mat = np.stack(preds, axis=0)
+            mean = pred_mat.mean(axis=0)
+            std = pred_mat.std(axis=0)
+            true = labels_device[:, class_index].detach().cpu().numpy().astype(int)
+
+            all_probs.extend(mean.tolist())
+            all_stds.extend(std.tolist())
+            all_true.extend(true.tolist())
+
+    return np.asarray(all_true), np.asarray(all_probs), np.asarray(all_stds)
+
+
+def _infer_with_noise_bayesian(
+    test_loader: torch.utils.data.DataLoader,
+    model: torch.nn.Module,
+    sigma: float,
+    class_index: int,
+    mc_passes: int,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    device = next(model.parameters()).device
+    all_probs: list[float] = []
+    all_stds: list[float] = []
+    all_true: list[int] = []
+
+    with torch.no_grad():
+        for mri, xls, labels, _ in test_loader:
+            mri_device = mri.float().to(device)
+            xls_device = xls.float().to(device)
+            labels_device = labels.to(device)
+            noisy = _apply_scaled_noise(mri_device, sigma)
+            draws = []
+            for _ in range(mc_passes):
+                out = model((noisy, xls_device))
+                draws.append(out.detach().cpu().numpy())
+
+            draw_mat = np.stack(draws, axis=0)  # (mc_passes, batch, classes)
+            mean = draw_mat.mean(axis=0)[:, class_index]
+            entropy_uncertainty = predictive_entropy(draw_mat)
+            true = labels_device[:, class_index].detach().cpu().numpy().astype(int)
+
+            all_probs.extend(mean.tolist())
+            all_stds.extend(np.asarray(entropy_uncertainty, dtype=float).tolist())
+            all_true.extend(true.tolist())
+
+    return np.asarray(all_true), np.asarray(all_probs), np.asarray(all_stds)
+
+
+def run_noise_analysis(
+    config: dict[str, Any],
+    root_dir: Path,
+    backend: str,
+    output_dir: Path,
+    class_index: int,
+    noise_sigmas: list[float],
+    threshold: float,
+    calibration_bins: int,
+    bayesian_mc_passes: int,
+) -> dict[str, Any]:
+    test_loader = _build_test_loader(config, root_dir)
+    examples_dir = output_dir / "noise_examples"
+    examples_dir.mkdir(parents=True, exist_ok=True)
+
+    rows: list[dict[str, Any]] = []
+    if backend == "ensemble":
+        models = _load_ensemble_models(config)
+        example_rows: list[tuple[float, torch.Tensor]] = []
+        for sigma in noise_sigmas:
+            y_true, y_prob, y_std = _infer_with_noise_ensemble(
+                test_loader,
+                models,
+                sigma,
+                class_index=class_index,
+            )
+            perf = performance_at_threshold(y_true, y_prob, threshold)
+            cal, _ = calibration_stats(y_true, y_prob, bins=calibration_bins)
+            rows.append(
+                {
+                    "uncertainty_metric": "std",
+                    "sigma": float(sigma),
+                    "accuracy": float(perf["accuracy"]),
+                    "f1": float(perf["f1"]),
+                    "ece": float(cal["ece"]),
+                    "mce": float(cal["mce"]),
+                    "mean_confidence_certainty": float(
+                        np.nanmean(_confidence_certainty(y_prob))
+                    ),
+                    "mean_confidence_uncertainty": float(
+                        np.nanmean(_confidence_uncertainty(y_prob))
+                    ),
+                    "mean_std": float(np.nanmean(y_std)),
+                    "mean_predictive_entropy": float("nan"),
+                }
+            )
+
+        with torch.no_grad():
+            sample = next(iter(test_loader))
+            original_mri = sample[0]
+            device = next(models[0].parameters()).device
+            original_device = original_mri.float().to(device)
+            for sigma in noise_sigmas:
+                noisy_mri = _apply_scaled_noise(original_device, sigma)
+                example_rows.append((float(sigma), noisy_mri.detach().cpu()))
+
+        _save_noise_example_grid(
+            original_mri=original_mri,
+            noisy_by_sigma=example_rows,
+            output_path=examples_dir / f"{backend}_noise_examples.png",
+            title=f"{backend.title()} Noise Examples",
+        )
+    elif backend == "bayesian":
+        model = _load_bayesian_model(config)
+        example_rows = []
+        for sigma in noise_sigmas:
+            y_true, y_prob, y_std = _infer_with_noise_bayesian(
+                test_loader,
+                model,
+                sigma,
+                class_index=class_index,
+                mc_passes=bayesian_mc_passes,
+            )
+            perf = performance_at_threshold(y_true, y_prob, threshold)
+            cal, _ = calibration_stats(y_true, y_prob, bins=calibration_bins)
+            rows.append(
+                {
+                    "uncertainty_metric": "predictive_entropy",
+                    "sigma": float(sigma),
+                    "accuracy": float(perf["accuracy"]),
+                    "f1": float(perf["f1"]),
+                    "ece": float(cal["ece"]),
+                    "mce": float(cal["mce"]),
+                    "mean_confidence_certainty": float(
+                        np.nanmean(_confidence_certainty(y_prob))
+                    ),
+                    "mean_confidence_uncertainty": float(
+                        np.nanmean(_confidence_uncertainty(y_prob))
+                    ),
+                    # Compatibility field name retained for downstream code.
+                    "mean_std": float(np.nanmean(y_std)),
+                    "mean_predictive_entropy": float(np.nanmean(y_std)),
+                }
+            )
+
+        with torch.no_grad():
+            sample = next(iter(test_loader))
+            original_mri = sample[0]
+            device = next(model.parameters()).device
+            original_device = original_mri.float().to(device)
+            for sigma in noise_sigmas:
+                noisy_mri = _apply_scaled_noise(original_device, sigma)
+                example_rows.append((float(sigma), noisy_mri.detach().cpu()))
+
+        _save_noise_example_grid(
+            original_mri=original_mri,
+            noisy_by_sigma=example_rows,
+            output_path=examples_dir / f"{backend}_noise_examples.png",
+            title=f"{backend.title()} Noise Examples",
+        )
+    else:
+        raise ValueError(f"Unsupported backend for noise analysis: {backend}")
+
+    df = pd.DataFrame(rows).sort_values("sigma")
+    csv_path = output_dir / "noise_sensitivity.csv"
+    df.to_csv(csv_path, index=False)
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.plot(df["sigma"], df["accuracy"], marker="o", label="accuracy")
+    ax.plot(df["sigma"], df["f1"], marker="s", label="f1")
+    ax.plot(df["sigma"], df["ece"], marker="^", label="ece")
+    ax.set_xlabel("Gaussian Noise Sigma")
+    ax.set_ylabel("Score")
+    ax.set_title(f"Noise Sensitivity ({backend})")
+    ax.grid(True, alpha=0.3)
+    ax.legend()
+    fig.tight_layout()
+    plot_path = output_dir / "noise_sensitivity.png"
+    fig.savefig(plot_path)
+    plt.close(fig)
+
+    fig_u, ax_u = plt.subplots(figsize=(10, 5))
+    ax_u.plot(
+        df["sigma"],
+        df["mean_confidence_uncertainty"],
+        marker="o",
+        label="confidence_uncertainty",
+    )
+    ax_u.plot(df["sigma"], df["mean_std"], marker="s", label="std_uncertainty")
+    ax_u.set_xlabel("Gaussian Noise Sigma")
+    ax_u.set_ylabel("Uncertainty")
+    ax_u.set_title(f"Uncertainty vs Noise ({backend})")
+    ax_u.grid(True, alpha=0.3)
+    ax_u.legend()
+    fig_u.tight_layout()
+    uncertainty_plot_path = output_dir / "noise_uncertainty.png"
+    fig_u.savefig(uncertainty_plot_path)
+    plt.close(fig_u)
+
+    fig_c, ax_c = plt.subplots(figsize=(10, 5))
+    ax_c.plot(
+        df["sigma"],
+        df["mean_confidence_certainty"],
+        marker="o",
+        label="confidence_certainty",
+    )
+    ax_c.set_xlabel("Gaussian Noise Sigma")
+    ax_c.set_ylabel("Certainty")
+    ax_c.set_title(f"Confidence Certainty vs Noise ({backend})")
+    ax_c.grid(True, alpha=0.3)
+    ax_c.legend()
+    fig_c.tight_layout()
+    certainty_plot_path = output_dir / "noise_confidence_certainty.png"
+    fig_c.savefig(certainty_plot_path)
+    plt.close(fig_c)
+
+    out = {
+        "table": str(csv_path),
+        "plot": str(plot_path),
+        "uncertainty_plot": str(uncertainty_plot_path),
+        "certainty_plot": str(certainty_plot_path),
+        "noise_sigmas": noise_sigmas,
+    }
+    write_json(output_dir / "noise_summary.json", out)
+    return out

+ 51 - 0
analysis/runtime.py

@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+import json
+from pathlib import Path
+import tomllib
+from typing import Any
+
+
+@dataclass(frozen=True)
+class RuntimePaths:
+    root_dir: Path
+    analysis_dir: Path
+    output_root: Path
+    run_dir: Path
+
+
+def load_config(root_dir: Path) -> dict[str, Any]:
+    config_path = root_dir / "config.toml"
+    with config_path.open("rb") as f:
+        return tomllib.load(f)
+
+
+def init_runtime_paths(analysis_dir: Path, run_name: str | None = None) -> RuntimePaths:
+    root_dir = analysis_dir.parent
+    output_root = root_dir / "analysis_output"
+    output_root.mkdir(parents=True, exist_ok=True)
+
+    safe_run_name = run_name or datetime.now().strftime("run_%Y%m%d_%H%M%S")
+    run_dir = output_root / safe_run_name
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    return RuntimePaths(
+        root_dir=root_dir,
+        analysis_dir=analysis_dir,
+        output_root=output_root,
+        run_dir=run_dir,
+    )
+
+
+def backend_dir(paths: RuntimePaths, backend: str) -> Path:
+    out = paths.run_dir / backend
+    out.mkdir(parents=True, exist_ok=True)
+    return out
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        json.dump(payload, f, indent=2)

+ 1 - 1
run_overnight_training.py

@@ -111,7 +111,7 @@ def main() -> int:
     log_dir = (
         pl.Path(args.log_dir).resolve()
         if args.log_dir is not None
-        else workdir / "logs"
+        else                 / "logs"
     )
     run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     run_log_dir = log_dir / f"overnight_{run_stamp}"