| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402 |
- # pyright: basic
- from __future__ import annotations
- import argparse
- from pathlib import Path
- from typing import Any
- import pandas as pd
- from tqdm.auto import tqdm
- from .analysis_modules import (
- run_calibration,
- run_longitudinal,
- run_performance,
- run_physician,
- )
- from .data_access import load_backend_evaluation, load_clinical_table
- from .dataset_summary import run_dataset_summary
- from .defaults import (
- DEFAULT_BACKENDS,
- DEFAULT_BAYESIAN_MC_PASSES,
- DEFAULT_CALIBRATION_BINS,
- DEFAULT_DECISION_THRESHOLD,
- DEFAULT_POSITIVE_CLASS_INDEX,
- noise_factor_grid,
- threshold_grid,
- )
- from .holdout_evaluation import ensure_backend_netcdf
- from .longitudinal_audit import run_longitudinal_breakdown_audit
- from .noise_analysis import run_noise_analysis
- from .noise_correlation import run_noise_accuracy_uncertainty_analysis
- from .runtime import backend_dir, init_runtime_paths, load_config, write_json
def _plot_description(filename: str) -> str:
    """Return the short caption associated with a generated plot file.

    Args:
        filename: Bare file name of the plot image (for example
            ``"calibration_reliability.png"``).

    Returns:
        The caption registered for ``filename``, or the generic
        ``"Generated analysis plot."`` when the name is not registered.
    """
    # The ensemble and bayesian example figures share the same wording.
    noise_examples = (
        "Representative noisy image slices across selected Gaussian noise factors."
    )
    clean_example = "Example clean scan image with no added noise."
    fallback = "Generated analysis plot."

    captions = {
        # Threshold sweep.
        "performance_threshold_accuracy.png": "Accuracy as the decision threshold varies.",
        "performance_threshold_f1.png": "F1 score as the decision threshold varies.",
        "performance_threshold_accuracy_f1.png": "Accuracy and F1 shown side-by-side as the decision threshold varies.",
        # Uncertainty cutoffs.
        "performance_uncertainty_cutoff_accuracy.png": "Accuracy while progressively restricting to higher-confidence and uncertainty-metric subsets.",
        "performance_uncertainty_cutoff_f1.png": "F1 score while progressively restricting to higher-confidence and uncertainty-metric subsets.",
        "performance_uncertainty_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across uncertainty-cutoff restriction levels.",
        "performance_uncertainty_percentile_cutoff_accuracy.png": "Accuracy from least to most restricted percentile-wise subset selection.",
        "performance_uncertainty_percentile_cutoff_f1.png": "F1 score from least to most restricted percentile-wise subset selection.",
        "performance_uncertainty_percentile_cutoff_accuracy_f1.png": "Accuracy and F1 shown side-by-side across percentile-floor restriction levels.",
        # Calibration.
        "calibration_reliability.png": "Reliability diagram comparing predicted probability to empirical outcome frequency.",
        # Physician comparison.
        "physician_confidence_boxplot.png": "Confidence grouped by physician confidence ratings.",
        "physician_std_boxplot.png": "Standard deviation grouped by physician confidence ratings.",
        "physician_predictive_entropy_boxplot.png": "Predictive uncertainty grouped by physician confidence ratings.",
        # Longitudinal cohorts.
        "longitudinal_cohort_confidence.png": "Longitudinal cohort comparison using confidence.",
        "longitudinal_cohort_std.png": "Longitudinal cohort comparison using standard deviation.",
        "longitudinal_cohort_predictive_entropy.png": "Longitudinal cohort comparison using predictive uncertainty.",
        # Noise sensitivity.
        "noise_sensitivity_accuracy.png": "Accuracy trend across increasing Gaussian noise factors.",
        "noise_sensitivity_f1.png": "F1 trend across increasing Gaussian noise factors.",
        "noise_sensitivity_accuracy_f1.png": "Accuracy and F1 shown side-by-side across increasing Gaussian noise factors.",
        "noise_confidence.png": "Confidence trend across increasing Gaussian noise factors.",
        "noise_standard_deviation.png": "Standard deviation trend across increasing Gaussian noise factors.",
        "noise_confidence_standard_deviation.png": "Confidence and standard deviation shown side-by-side across increasing Gaussian noise factors.",
        "noise_predictive_uncertainty.png": "Predictive uncertainty trend across increasing Gaussian noise factors.",
        "noise_confidence_predictive_uncertainty.png": "Confidence and predictive uncertainty shown side-by-side across increasing Gaussian noise factors.",
        "noise_accuracy_uncertainty_2d.png": "2D uncertainty-vs-accuracy relationship with linear fit (noise factor encoded by color).",
        # Example images.
        "ensemble_noise_examples.png": noise_examples,
        "bayesian_noise_examples.png": noise_examples,
        "ensemble_clean_scan_example.png": clean_example,
        "bayesian_clean_scan_example.png": clean_example,
    }
    try:
        return captions[filename]
    except KeyError:
        return fallback
def _write_backend_plot_report(backend: str, out_dir: Path) -> Path:
    """Write a Markdown report listing every plot generated for a backend.

    All ``*.png`` files found recursively under ``out_dir / "plots"`` are
    listed in sorted path order. Each plot gets a heading derived from its
    file stem, its caption from ``_plot_description``, and an embedded
    Markdown image whose path is relative to ``out_dir`` (the directory the
    report itself is written to).

    Args:
        backend: Backend name; title-cased for the report heading.
        out_dir: Backend output directory that holds the ``plots`` folder and
            receives ``plots_report.md``.

    Returns:
        Path of the written ``plots_report.md`` file.
    """
    plots_dir = out_dir / "plots"
    images = sorted(plots_dir.rglob("*.png")) if plots_dir.exists() else []
    report_path = out_dir / "plots_report.md"

    lines = [
        f"# {backend.title()} Analysis Plot Report",
        "",
        "This document lists generated analysis plots with brief descriptions.",
        "",
    ]

    if not images:
        lines.append("No plot images were generated for this backend run.")
    else:
        for image_path in images:
            # POSIX separators keep the Markdown link valid on every platform.
            rel = image_path.relative_to(out_dir).as_posix()
            title = image_path.stem.replace("_", " ").title()
            lines.extend(
                [
                    f"## {title}",
                    _plot_description(image_path.name),
                    "",
                    # Embed the plot itself; previously an empty line was
                    # emitted here and ``rel`` was never used.
                    f"![{title}]({rel})",
                    "",
                ]
            )

    report_path.write_text("\n".join(lines), encoding="utf-8")
    return report_path
def _parse_args() -> argparse.Namespace:
    """Parse and validate the command-line options of the analysis runner.

    Returns:
        The parsed namespace with ``backend``, ``run_name``, ``skip_noise``,
        ``longitudinal_breakdown_only``, ``noise_correlation_only`` and
        ``dataset_summary_only`` attributes.

    Exits through ``parser.error`` when more than one of the ``--*-only``
    flags is supplied.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Run modular evaluation analyses for ensemble and bayesian models. "
            "All outputs are written to alnn_rewrite/analysis_output."
        )
    )
    cli.add_argument(
        "--backend",
        nargs="+",
        choices=["ensemble", "bayesian"],
        default=DEFAULT_BACKENDS,
        help="Backends to evaluate.",
    )
    cli.add_argument(
        "--run-name",
        default=None,
        help="Optional run directory name under analysis_output.",
    )

    # Boolean switches, registered in the order they appear in --help.
    switches = (
        (
            "--skip-noise",
            "Skip Gaussian noise sensitivity analysis.",
        ),
        (
            "--longitudinal-breakdown-only",
            "Run only longitudinal cohort breakdown audit from existing model "
            "evaluation outputs (no full analysis rerun).",
        ),
        (
            "--noise-correlation-only",
            "Run only the noise uncertainty-vs-accuracy correlation/regression "
            "analysis from an existing noise_sensitivity.csv per backend.",
        ),
        (
            "--dataset-summary-only",
            "Generate only dataset composition summary documentation "
            "(overall and train/validation/test class breakdown).",
        ),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    parsed = cli.parse_args()

    # The three "--*-only" modes are mutually exclusive.
    exclusive_selected = [
        flag_value
        for flag_value in (
            parsed.longitudinal_breakdown_only,
            parsed.noise_correlation_only,
            parsed.dataset_summary_only,
        )
        if flag_value
    ]
    if len(exclusive_selected) > 1:
        cli.error(
            "Only one of --longitudinal-breakdown-only, "
            "--noise-correlation-only, and --dataset-summary-only may be used at once."
        )
    return parsed
def _run_longitudinal_breakdown_only(
    config: dict[str, Any],
    backend: str,
    clinical_df: pd.DataFrame,
    out_dir: Path,
) -> dict[str, Any]:
    """Run only the longitudinal breakdown audit for one backend.

    Loads the backend's evaluation with ``load_backend_evaluation`` (using
    ``DEFAULT_POSITIVE_CLASS_INDEX``), passes it with the clinical table to
    ``run_longitudinal_breakdown_audit``, and writes the resulting summary to
    ``longitudinal_breakdown_backend_summary.json`` inside ``out_dir``.

    Args:
        config: Loaded project configuration.
        backend: Backend name to audit.
        clinical_df: Clinical table forwarded to the audit.
        out_dir: Backend output directory.

    Returns:
        The summary produced by the audit.
    """
    backend_evaluation = load_backend_evaluation(
        config=config,
        backend=backend,
        class_index=DEFAULT_POSITIVE_CLASS_INDEX,
    )
    audit_summary = run_longitudinal_breakdown_audit(
        evaluation=backend_evaluation,
        clinical_df=clinical_df,
        output_dir=out_dir,
    )
    summary_file = out_dir / "longitudinal_breakdown_backend_summary.json"
    write_json(summary_file, audit_summary)
    return audit_summary
def _run_noise_correlation_only(
    backend: str,
    out_dir: Path,
) -> dict[str, Any]:
    """Re-run only the noise accuracy/uncertainty analysis for one backend.

    Reads the existing ``noise_sensitivity.csv`` in ``out_dir``, feeds it to
    ``run_noise_accuracy_uncertainty_analysis``, and writes the summary to
    ``noise_accuracy_uncertainty_backend_summary.json`` in the same directory.

    Args:
        backend: Backend name forwarded to the analysis.
        out_dir: Backend output directory holding the noise table.

    Returns:
        The summary produced by the analysis.

    Raises:
        FileNotFoundError: If ``noise_sensitivity.csv`` does not exist in
            ``out_dir``.
    """
    table_csv = out_dir / "noise_sensitivity.csv"
    if not table_csv.exists():
        message = (
            f"Expected existing noise table for --noise-correlation-only: {table_csv}"
        )
        raise FileNotFoundError(message)

    correlation_summary = run_noise_accuracy_uncertainty_analysis(
        noise_df=pd.read_csv(table_csv),
        backend=backend,
        output_dir=out_dir,
    )
    summary_file = out_dir / "noise_accuracy_uncertainty_backend_summary.json"
    write_json(summary_file, correlation_summary)
    return correlation_summary
def _run_backend(
    config: dict[str, Any],
    root_dir: Path,
    backend: str,
    clinical_df: pd.DataFrame,
    skip_noise: bool,
    out_dir: Path,
) -> dict[str, Any]:
    """Run the full analysis pipeline for one backend and persist its summary.

    Stages run in order: performance, calibration, physician, longitudinal,
    then (unless ``skip_noise`` is set) noise sensitivity followed by the
    noise accuracy/uncertainty analysis. Afterwards the plot report is
    written and the collected summary is saved as ``backend_summary.json``
    in ``out_dir``.

    The two noise stages are best-effort: a failure is recorded in the
    summary as ``{"skipped": True, "reason": ...}`` instead of aborting the
    run. Each noise stage is handled separately, so a failure in the
    correlation stage no longer discards a successful noise-sensitivity
    result, and the progress bar advances exactly once per stage.

    Args:
        config: Loaded project configuration.
        root_dir: Project root forwarded to the evaluation helpers.
        backend: Backend name to analyse.
        clinical_df: Clinical table used by the physician and longitudinal
            stages.
        skip_noise: When true, both noise stages are skipped and recorded as
            such in the summary.
        out_dir: Backend output directory for all artefacts.

    Returns:
        The per-backend summary dictionary that was written to disk.

    Raises:
        Exception: Errors from the setup calls or from the first four stages
            are not caught here and propagate to the caller.
    """
    netcdf_path = ensure_backend_netcdf(
        config=config,
        root_dir=root_dir,
        backend=backend,
        bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES,
    )
    evaluation = load_backend_evaluation(
        config=config,
        backend=backend,
        class_index=DEFAULT_POSITIVE_CLASS_INDEX,
    )
    thresholds = threshold_grid()
    noise_factors = noise_factor_grid()

    summary: dict[str, Any] = {
        "backend": backend,
        "netcdf": str(netcdf_path),
        "source_file": str(evaluation.source_file),
        "uncertainty_metric": evaluation.uncertainty_metric,
    }

    # Four fixed stages plus the two optional noise stages.
    n_stages = 4 + (0 if skip_noise else 2)
    stage_bar = tqdm(
        total=n_stages,
        desc=f"[{backend}] analysis stages",
        unit="stage",
        leave=False,
    )
    try:
        stage_bar.set_postfix_str("performance")
        summary["performance"] = run_performance(
            evaluation=evaluation,
            output_dir=out_dir,
            thresholds=thresholds,
        )
        stage_bar.update(1)

        stage_bar.set_postfix_str("calibration")
        summary["calibration"] = run_calibration(
            evaluation=evaluation,
            output_dir=out_dir,
            bins=DEFAULT_CALIBRATION_BINS,
        )
        stage_bar.update(1)

        stage_bar.set_postfix_str("physician")
        summary["physician"] = run_physician(
            evaluation=evaluation,
            clinical_df=clinical_df,
            output_dir=out_dir,
        )
        stage_bar.update(1)

        stage_bar.set_postfix_str("longitudinal")
        summary["longitudinal"] = run_longitudinal(
            evaluation=evaluation,
            clinical_df=clinical_df,
            output_dir=out_dir,
        )
        stage_bar.update(1)

        if skip_noise:
            summary["noise"] = {"skipped": True, "reason": "--skip-noise supplied"}
            summary["noise_accuracy_uncertainty"] = {
                "skipped": True,
                "reason": "Noise analysis skipped, so no noise table available.",
            }
        else:
            try:
                stage_bar.set_postfix_str("noise")
                summary["noise"] = run_noise_analysis(
                    config=config,
                    root_dir=root_dir,
                    backend=backend,
                    output_dir=out_dir,
                    class_index=DEFAULT_POSITIVE_CLASS_INDEX,
                    noise_sigmas=noise_factors,
                    threshold=DEFAULT_DECISION_THRESHOLD,
                    calibration_bins=DEFAULT_CALIBRATION_BINS,
                    bayesian_mc_passes=DEFAULT_BAYESIAN_MC_PASSES,
                )
            except Exception as exc:
                # Without a noise table the correlation stage cannot run, so
                # both stages are recorded as skipped and accounted for.
                summary["noise"] = {
                    "skipped": True,
                    "reason": f"Noise analysis failed: {exc}",
                }
                summary["noise_accuracy_uncertainty"] = {
                    "skipped": True,
                    "reason": f"Noise relationship analysis failed: {exc}",
                }
                stage_bar.update(2)
            else:
                stage_bar.update(1)
                stage_bar.set_postfix_str("noise-correlation")
                try:
                    noise_table_path = Path(str(summary["noise"]["table"]))
                    noise_df = pd.read_csv(noise_table_path)
                    summary["noise_accuracy_uncertainty"] = (
                        run_noise_accuracy_uncertainty_analysis(
                            noise_df=noise_df,
                            backend=backend,
                            output_dir=out_dir,
                        )
                    )
                except Exception as exc:
                    # Keep the successful noise summary; only this stage failed.
                    summary["noise_accuracy_uncertainty"] = {
                        "skipped": True,
                        "reason": f"Noise relationship analysis failed: {exc}",
                    }
                stage_bar.update(1)
    finally:
        stage_bar.close()

    report_path = _write_backend_plot_report(backend=backend, out_dir=out_dir)
    summary["plots_report"] = str(report_path)
    write_json(out_dir / "backend_summary.json", summary)
    return summary
def main() -> None:
    """Entry point: parse options, run the selected mode, write the manifest.

    Resolves runtime paths relative to this module's directory, loads the
    configuration and clinical table, then either produces only the dataset
    summary or iterates over the requested backends in the selected mode
    (longitudinal breakdown only, noise correlation only, or full analysis).
    The collected results are written to ``run_manifest.json`` in the run
    directory.
    """
    args = _parse_args()
    paths = init_runtime_paths(
        analysis_dir=Path(__file__).resolve().parent,
        run_name=args.run_name,
    )
    config = load_config(paths.root_dir)
    clinical_df = load_clinical_table(config=config, root_dir=paths.root_dir)

    # Same precedence as the CLI flags: dataset summary, then longitudinal
    # breakdown, then noise correlation, otherwise the full pipeline.
    if args.dataset_summary_only:
        mode = "dataset_summary_only"
    elif args.longitudinal_breakdown_only:
        mode = "longitudinal_breakdown_only"
    elif args.noise_correlation_only:
        mode = "noise_correlation_only"
    else:
        mode = "full"

    manifest: dict[str, Any] = {
        "run_dir": str(paths.run_dir),
        "output_root": str(paths.output_root),
        "mode": mode,
        "positive_class_index": DEFAULT_POSITIVE_CLASS_INDEX,
        "threshold_sweep": {
            "values": list(map(float, threshold_grid().tolist())),
        },
        "calibration_bins": DEFAULT_CALIBRATION_BINS,
        "noise_factors": noise_factor_grid(),
        "bayesian_mc_passes": DEFAULT_BAYESIAN_MC_PASSES,
        "decision_threshold": DEFAULT_DECISION_THRESHOLD,
        "backends": {},
    }

    if args.dataset_summary_only:
        manifest["dataset_summary"] = run_dataset_summary(
            config=config,
            root_dir=paths.root_dir,
            output_dir=paths.run_dir,
            positive_class_index=DEFAULT_POSITIVE_CLASS_INDEX,
        )
        write_json(paths.run_dir / "run_manifest.json", manifest)
        print(f"Dataset summary complete. Results saved to {paths.run_dir}")
        return

    progress = tqdm(args.backend, desc="Backends", unit="backend")
    for backend in progress:
        out_dir = backend_dir(paths, backend)
        progress.set_postfix_str(backend)

        if args.longitudinal_breakdown_only:
            backend_result = _run_longitudinal_breakdown_only(
                config=config,
                backend=backend,
                clinical_df=clinical_df,
                out_dir=out_dir,
            )
        elif args.noise_correlation_only:
            backend_result = _run_noise_correlation_only(
                backend=backend,
                out_dir=out_dir,
            )
        else:
            backend_result = _run_backend(
                config=config,
                root_dir=paths.root_dir,
                backend=backend,
                clinical_df=clinical_df,
                skip_noise=bool(args.skip_noise),
                out_dir=out_dir,
            )
        manifest["backends"][backend] = backend_result

    write_json(paths.run_dir / "run_manifest.json", manifest)
    print(f"Analysis complete. Results saved to {paths.run_dir}")
# Run the analysis only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|