# pyright: basic from __future__ import annotations import argparse from pathlib import Path from typing import Any import pandas as pd from tqdm.auto import tqdm from analysis.analysis_modules import ( run_calibration, run_longitudinal, run_performance, run_physician, ) from analysis.dataset_summary import run_dataset_summary from analysis.data_access import load_backend_evaluation, load_clinical_table from analysis.defaults import ( DEFAULT_BACKENDS, DEFAULT_BAYESIAN_MC_PASSES, DEFAULT_CALIBRATION_BINS, DEFAULT_DECISION_THRESHOLD, DEFAULT_POSITIVE_CLASS_INDEX, noise_factor_grid, threshold_grid, ) from analysis.holdout_evaluation import ensure_backend_netcdf from analysis.longitudinal_audit import run_longitudinal_breakdown_audit from analysis.noise_correlation import run_noise_accuracy_uncertainty_analysis from analysis.noise_analysis import run_noise_analysis from analysis.runtime import backend_dir, init_runtime_paths, load_config, write_json def _plot_description(filename: str) -> str: descriptions = { "performance_threshold_accuracy.png": "Accuracy as the decision threshold varies.", "performance_threshold_f1.png": "F1 score as the decision threshold varies.", "performance_threshold_accuracy_f1.png": "Accuracy and F1 shown side-by-side as the decision threshold varies.", "performance_uncertainty_cutoff_accuracy.png": "Accuracy while progressively restricting to higher-confidence and uncertainty-metric subsets.", "performance_uncertainty_cutoff_f1.png": "F1 score while progressively restricting to higher-confidence and uncertainty-metric subsets.", "performance_uncertainty_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across uncertainty-cutoff restriction levels.", "performance_uncertainty_percentile_cutoff_accuracy.png": "Accuracy from least to most restricted percentile-wise subset selection.", "performance_uncertainty_percentile_cutoff_f1.png": "F1 score from least to most restricted percentile-wise subset selection.", "performance_uncertainty_percentile_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across percentile-floor restriction levels.", "calibration_reliability.png": "Reliability diagram comparing predicted probability to empirical outcome frequency.", "physician_confidence_boxplot.png": "Confidence grouped by physician confidence ratings.", "physician_std_boxplot.png": "Standard deviation grouped by physician confidence ratings.", "physician_predictive_entropy_boxplot.png": "Predictive uncertainty grouped by physician confidence ratings.", "longitudinal_cohort_confidence.png": "Longitudinal cohort comparison using confidence.", "longitudinal_cohort_std.png": "Longitudinal cohort comparison using standard deviation.", "longitudinal_cohort_predictive_entropy.png": "Longitudinal cohort comparison using predictive uncertainty.", "noise_sensitivity_accuracy.png": "Accuracy trend across increasing Gaussian noise factors.", "noise_sensitivity_f1.png": "F1 trend across increasing Gaussian noise factors.", "noise_sensitivity_accuracy_f1.png": "Accuracy and F1 shown side-by-side across increasing Gaussian noise factors.", "noise_confidence.png": "Confidence trend across increasing Gaussian noise factors.", "noise_standard_deviation.png": "Standard deviation trend across increasing Gaussian noise factors.", "noise_confidence_standard_deviation.png": "Confidence and standard deviation shown side-by-side across increasing Gaussian noise factors.", "noise_predictive_uncertainty.png": "Predictive uncertainty trend across increasing Gaussian noise factors.", "noise_confidence_predictive_uncertainty.png": "Confidence and predictive uncertainty shown side-by-side across increasing Gaussian noise factors.", "noise_accuracy_uncertainty_2d.png": "2D uncertainty-vs-accuracy relationship with linear fit (noise factor encoded by color).", "ensemble_noise_examples.png": "Representative noisy image slices across selected Gaussian noise factors.", "bayesian_noise_examples.png": "Representative noisy image slices across selected Gaussian noise factors.", "ensemble_clean_scan_example.png": "Example clean scan image with no added noise.", "bayesian_clean_scan_example.png": "Example clean scan image with no added noise.", } return descriptions.get(filename, "Generated analysis plot.") def _write_backend_plot_report(backend: str, out_dir: Path) -> Path: plots_dir = out_dir / "plots" images = sorted(plots_dir.rglob("*.png")) if plots_dir.exists() else [] report_path = out_dir / "plots_report.md" lines = [ f"# {backend.title()} Analysis Plot Report", "", "This document lists generated analysis plots with brief descriptions.", "", ] if not images: lines.append("No plot images were generated for this backend run.") else: for image_path in images: rel = image_path.relative_to(out_dir).as_posix() title = image_path.stem.replace("_", " ").title() lines.append(f"## {title}") lines.append(_plot_description(image_path.name)) lines.append("") lines.append(f"![{title}]({rel})") lines.append("") report_path.write_text("\n".join(lines), encoding="utf-8") return report_path def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description=( "Run modular evaluation analyses for ensemble and bayesian models. " "All outputs are written to alnn_rewrite/analysis_output." ) ) parser.add_argument( "--backend", nargs="+", choices=["ensemble", "bayesian"], default=DEFAULT_BACKENDS, help="Backends to evaluate.", ) parser.add_argument( "--run-name", default=None, help="Optional run directory name under analysis_output.", ) parser.add_argument( "--skip-noise", action="store_true", help="Skip Gaussian noise sensitivity analysis.", ) parser.add_argument( "--longitudinal-breakdown-only", action="store_true", help=( "Run only longitudinal cohort breakdown audit from existing model " "evaluation outputs (no full analysis rerun)." ), ) parser.add_argument( "--noise-correlation-only", action="store_true", help=( "Run only the noise uncertainty-vs-accuracy correlation/regression " "analysis from an existing noise_sensitivity.csv per backend." ), ) parser.add_argument( "--dataset-summary-only", action="store_true", help=( "Generate only dataset composition summary documentation " "(overall and train/validation/test class breakdown)." ), ) args = parser.parse_args() only_modes = [ bool(args.longitudinal_breakdown_only), bool(args.noise_correlation_only), bool(args.dataset_summary_only), ] if sum(only_modes) > 1: parser.error( "Only one of --longitudinal-breakdown-only, " "--noise-correlation-only, and --dataset-summary-only may be used at once." ) return args def _run_longitudinal_breakdown_only( config: dict[str, Any], backend: str, clinical_df: pd.DataFrame, out_dir: Path, ) -> dict[str, Any]: evaluation = load_backend_evaluation( config=config, backend=backend, class_index=DEFAULT_POSITIVE_CLASS_INDEX, ) summary = run_longitudinal_breakdown_audit( evaluation=evaluation, clinical_df=clinical_df, output_dir=out_dir, ) write_json(out_dir / "longitudinal_breakdown_backend_summary.json", summary) return summary def _run_noise_correlation_only( backend: str, out_dir: Path, ) -> dict[str, Any]: noise_table_path = out_dir / "noise_sensitivity.csv" if not noise_table_path.exists(): raise FileNotFoundError( f"Expected existing noise table for --noise-correlation-only: {noise_table_path}" ) noise_df = pd.read_csv(noise_table_path) summary = run_noise_accuracy_uncertainty_analysis( noise_df=noise_df, backend=backend, output_dir=out_dir, ) write_json(out_dir / "noise_accuracy_uncertainty_backend_summary.json", summary) return summary def _run_backend( config: dict[str, Any], root_dir: Path, backend: str, clinical_df: pd.DataFrame, skip_noise: bool, out_dir: Path, ) -> dict[str, Any]: netcdf_path = ensure_backend_netcdf( config=config, root_dir=root_dir, backend=backend, bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES, ) evaluation = load_backend_evaluation( config=config, backend=backend, class_index=DEFAULT_POSITIVE_CLASS_INDEX, ) thresholds = threshold_grid() noise_factors = noise_factor_grid() summary: dict[str, Any] = { "backend": backend, "netcdf": str(netcdf_path), "source_file": str(evaluation.source_file), "uncertainty_metric": evaluation.uncertainty_metric, } n_stages = 4 + (0 if skip_noise else 2) stage_bar = tqdm( total=n_stages, desc=f"[{backend}] analysis stages", unit="stage", leave=False, ) try: stage_bar.set_postfix_str("performance") summary["performance"] = run_performance( evaluation=evaluation, output_dir=out_dir, thresholds=thresholds, ) stage_bar.update(1) stage_bar.set_postfix_str("calibration") summary["calibration"] = run_calibration( evaluation=evaluation, output_dir=out_dir, bins=DEFAULT_CALIBRATION_BINS, ) stage_bar.update(1) stage_bar.set_postfix_str("physician") summary["physician"] = run_physician( evaluation=evaluation, clinical_df=clinical_df, output_dir=out_dir, ) stage_bar.update(1) stage_bar.set_postfix_str("longitudinal") summary["longitudinal"] = run_longitudinal( evaluation=evaluation, clinical_df=clinical_df, output_dir=out_dir, ) stage_bar.update(1) if skip_noise: summary["noise"] = {"skipped": True, "reason": "--skip-noise supplied"} summary["noise_accuracy_uncertainty"] = { "skipped": True, "reason": "Noise analysis skipped, so no noise table available.", } else: try: stage_bar.set_postfix_str("noise") summary["noise"] = run_noise_analysis( config=config, root_dir=root_dir, backend=backend, output_dir=out_dir, class_index=DEFAULT_POSITIVE_CLASS_INDEX, noise_sigmas=noise_factors, threshold=DEFAULT_DECISION_THRESHOLD, calibration_bins=DEFAULT_CALIBRATION_BINS, bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES, ) stage_bar.update(1) stage_bar.set_postfix_str("noise-correlation") noise_table_path = Path(str(summary["noise"]["table"])) noise_df = pd.read_csv(noise_table_path) summary["noise_accuracy_uncertainty"] = ( run_noise_accuracy_uncertainty_analysis( noise_df=noise_df, backend=backend, output_dir=out_dir, ) ) stage_bar.update(1) except Exception as exc: summary["noise"] = { "skipped": True, "reason": f"Noise analysis failed: {exc}", } summary["noise_accuracy_uncertainty"] = { "skipped": True, "reason": f"Noise relationship analysis failed: {exc}", } stage_bar.update(2) finally: stage_bar.close() report_path = _write_backend_plot_report(backend=backend, out_dir=out_dir) summary["plots_report"] = str(report_path) write_json(out_dir / "backend_summary.json", summary) return summary def main() -> None: args = _parse_args() analysis_dir = Path(__file__).resolve().parent paths = init_runtime_paths(analysis_dir=analysis_dir, run_name=args.run_name) config = load_config(paths.root_dir) clinical_df = load_clinical_table(config=config, root_dir=paths.root_dir) manifest: dict[str, Any] = { "run_dir": str(paths.run_dir), "output_root": str(paths.output_root), "mode": ( "dataset_summary_only" if bool(args.dataset_summary_only) else ( "longitudinal_breakdown_only" if bool(args.longitudinal_breakdown_only) else ( "noise_correlation_only" if bool(args.noise_correlation_only) else "full" ) ) ), "positive_class_index": DEFAULT_POSITIVE_CLASS_INDEX, "threshold_sweep": { "values": [float(v) for v in threshold_grid().tolist()], }, "calibration_bins": DEFAULT_CALIBRATION_BINS, "noise_factors": noise_factor_grid(), "bayesian_mc_passes": DEFAULT_BAYESIAN_MC_PASSES, "decision_threshold": DEFAULT_DECISION_THRESHOLD, "backends": {}, } if args.dataset_summary_only: manifest["dataset_summary"] = run_dataset_summary( config=config, root_dir=paths.root_dir, output_dir=paths.run_dir, positive_class_index=DEFAULT_POSITIVE_CLASS_INDEX, ) write_json(paths.run_dir / "run_manifest.json", manifest) print(f"Dataset summary complete. Results saved to {paths.run_dir}") return backend_iter = tqdm(args.backend, desc="Backends", unit="backend") for backend in backend_iter: out_dir = backend_dir(paths, backend) backend_iter.set_postfix_str(backend) if args.longitudinal_breakdown_only: manifest["backends"][backend] = _run_longitudinal_breakdown_only( config=config, backend=backend, clinical_df=clinical_df, out_dir=out_dir, ) elif args.noise_correlation_only: manifest["backends"][backend] = _run_noise_correlation_only( backend=backend, out_dir=out_dir, ) else: manifest["backends"][backend] = _run_backend( config=config, root_dir=paths.root_dir, backend=backend, clinical_df=clinical_df, skip_noise=bool(args.skip_noise), out_dir=out_dir, ) write_json(paths.run_dir / "run_manifest.json", manifest) print(f"Analysis complete. Results saved to {paths.run_dir}") if __name__ == "__main__": main()