| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476 |
- # The purpose of this file is to perform a sensitivity analysis on the model evaluation results and graph the findings.
- # The sensitivity analysis will be done by varying the number of models used in the ensemble and observing the effect on overall accuracy.
# We will take 50 different random arrangements of models for each ensemble size (except the full ensemble, which is evaluated once deterministically) to get a distribution of accuracies for each ensemble size.
# These will have associated error bars based on the standard deviation of the accuracies for each ensemble size.
- import xarray as xr
- from utils.config import config
- import pathlib as pl
- import numpy as np
- import matplotlib.pyplot as plt
- import os
# Load the evaluation results produced by the model-evaluation step.
# Work relative to this script's directory so the relative paths below resolve.
os.chdir(pl.Path(__file__).parent)
# Dataset name comes from the shared config; stored as <evaluation_name>.nc (netCDF).
model_dataset_path = pl.Path("../model_evaluations") / pl.Path(
    config["analysis"]["evaluation_name"].strip()
).with_suffix(".nc")
print(f"Loading evaluation results from {model_dataset_path}")
array = xr.open_dataset(model_dataset_path)  # type: ignore
# This section was generated by Github Copilot - 2025-11-04
# Perform sensitivity analysis by varying ensemble size and sampling subsets of models.
# predictions: per-model class confidences; labels: ground-truth class indicators.
# NOTE(review): assumes predictions has dims (model, img_class, sample) and labels
# has dims (label, sample), matching the selections below — TODO confirm upstream.
predictions: xr.DataArray = array["predictions"]
labels: xr.DataArray = array["labels"]
# Make plots directory if it doesn't exist (matching other scripts)
plots_dir = (
    pl.Path("../output") / pl.Path(config["analysis"]["evaluation_name"]) / "plots"
)
plots_dir.mkdir(parents=True, exist_ok=True)
# Configuration for the sensitivity analysis
rng = np.random.default_rng(777)  # fixed seed: reproducible subset draws and plot jitter
num_models = int(predictions.sizes["model"])
ensemble_sizes = list(range(1, num_models + 1))
samples_per_size = 50  # random subsets drawn per ensemble size (full set is drawn once)
# Extract true labels for the positive class (assumes same structure as other script)
true_labels = labels.sel(label=1).values  # shape: (n_samples,)
# Container for results (one mean/std entry per ensemble size, in order)
mean_accuracies: list[float] = []
std_accuracies: list[float] = []
all_accuracies: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
# Confusion-matrix counts per ensemble size (one entry per sample draw)
all_true_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
all_false_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
# Also implement a "confidence accuracy score":
# sum over samples of (confidence_of_predicted_class - 0.5) * (+1 if correct else -1)
confidence_accuracy_scores: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
mean_confidence_accuracy_scores: list[float] = []
std_confidence_accuracy_scores: list[float] = []
# Track average confidence of predicted class for correct vs incorrect predictions
correct_prediction_confidences: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
incorrect_prediction_confidences: dict[int, list[float]] = {
    k: [] for k in ensemble_sizes
}
mean_correct_prediction_confidences: list[float] = []
std_correct_prediction_confidences: list[float] = []
mean_incorrect_prediction_confidences: list[float] = []
std_incorrect_prediction_confidences: list[float] = []
# Track delta (correct - incorrect) confidence per model, averaged over models
confidence_delta_per_model: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
mean_confidence_delta_per_model: list[float] = []
std_confidence_delta_per_model: list[float] = []
def confidence_accuracy_score(
    positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
) -> float:
    """Per-image confidence-weighted accuracy score.

    Each image contributes ``(predicted-class confidence - 0.5)``, counted
    positively when the prediction is correct and negatively otherwise; the
    total is divided by ``0.5 * n_samples``, so the result lies in [-1, 1].

    Parameters
    ----------
    positive_class_confidence:
        Probability/confidence assigned to the positive class (class 1),
        shape (n_samples,).
    true_labels_binary:
        Ground-truth binary labels in {0, 1}, shape (n_samples,).

    Raises
    ------
    ValueError
        If the two arrays have different lengths.
    """
    p1 = np.asarray(positive_class_confidence, dtype=float)
    targets = np.asarray(true_labels_binary, dtype=int)
    if p1.shape[0] != targets.shape[0]:
        raise ValueError(
            f"Mismatched lengths: confs has {p1.shape[0]} samples, labels has {targets.shape[0]}"
        )
    says_positive = p1 >= 0.5
    is_positive = targets == 1
    # Confidence of whichever class was predicted, so confident correct
    # negatives also contribute positively.
    chosen_confidence = np.where(says_positive, p1, 1.0 - p1)
    signs = np.where(says_positive == is_positive, 1.0, -1.0)
    total = np.sum((chosen_confidence - 0.5) * signs)
    # Per-image average, scaled by 0.5 so a perfectly confident, perfectly
    # correct classifier scores exactly 1.0.
    return float(total / (0.5 * p1.shape[0]))
def average_predicted_class_confidence_by_correctness(
    positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
) -> tuple[float, float]:
    """Mean predicted-class confidence, split by prediction correctness.

    Per-sample confidence of the predicted class is used:
    - predicted positive: confidence = P(class=1)
    - predicted negative: confidence = P(class=0) = 1 - P(class=1)

    Returns
    -------
    tuple[float, float]
        ``(mean_conf_correct, mean_conf_incorrect)``; a group with no
        samples yields ``nan`` for its mean.

    Raises
    ------
    ValueError
        If the two arrays have different lengths.
    """
    p1 = np.asarray(positive_class_confidence, dtype=float)
    targets = np.asarray(true_labels_binary, dtype=int)
    if p1.shape[0] != targets.shape[0]:
        raise ValueError(
            f"Mismatched lengths: confs has {p1.shape[0]} samples, labels has {targets.shape[0]}"
        )
    says_positive = p1 >= 0.5
    hit = says_positive == (targets == 1)
    chosen_confidence = np.where(says_positive, p1, 1.0 - p1)

    def _safe_mean(values: np.ndarray) -> float:
        # Empty group -> nan rather than numpy's mean-of-empty warning.
        return float(np.mean(values)) if values.size else float("nan")

    return _safe_mean(chosen_confidence[hit]), _safe_mean(chosen_confidence[~hit])
def average_confidence_delta_per_model(
    predictions_selected: "xr.DataArray | np.ndarray",
    true_labels_binary: np.ndarray,
) -> float:
    """Average (correct - incorrect) predicted-class confidence per model.

    For each model independently:
    1) compute predicted-class confidence per sample
    2) split into correct vs incorrect samples
    3) delta = mean(correct) - mean(incorrect)

    Returns the mean delta across models, ignoring models whose delta is
    undefined (all-correct or all-incorrect); ``nan`` if no model has a
    defined delta.

    Parameters
    ----------
    predictions_selected:
        Either an xarray DataArray with an ``img_class`` coordinate (the
        positive-class slice is extracted via ``.sel(img_class=1)``), or a
        plain ``(k_models, n_samples)`` array of positive-class confidences.
    true_labels_binary:
        Ground-truth binary labels in {0, 1}, shape (n_samples,).

    Raises
    ------
    ValueError
        If the sample counts of predictions and labels disagree.
    """
    # Duck-type: accept a raw ndarray as well as the original DataArray input.
    if hasattr(predictions_selected, "sel"):
        per_model_confs = np.asarray(
            predictions_selected.sel(img_class=1).values, dtype=float
        )  # (k_models, n)
    else:
        per_model_confs = np.asarray(predictions_selected, dtype=float)
    y = np.asarray(true_labels_binary, dtype=int)
    if per_model_confs.shape[1] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: predictions have {per_model_confs.shape[1]} samples, labels has {y.shape[0]}"
        )
    # For each model, determine predicted class and confidence of predicted class.
    predicted_positive = per_model_confs >= 0.5
    correct = predicted_positive == (y == 1)  # broadcasts to (k_models, n)
    predicted_class_confidence = np.where(
        predicted_positive, per_model_confs, 1.0 - per_model_confs
    )
    # Per-model means for correct/incorrect, then the delta.
    deltas: list[float] = []
    for conf_m, correct_m in zip(predicted_class_confidence, correct):
        correct_vals = conf_m[correct_m]
        incorrect_vals = conf_m[~correct_m]
        if correct_vals.size == 0 or incorrect_vals.size == 0:
            # Delta undefined for this model; excluded by nanmean below.
            deltas.append(float("nan"))
        else:
            deltas.append(float(np.mean(correct_vals) - np.mean(incorrect_vals)))
    deltas_arr = np.asarray(deltas, dtype=float)
    if np.all(np.isnan(deltas_arr)):
        # nanmean on an all-NaN input emits a RuntimeWarning; return nan explicitly.
        return float("nan")
    return float(np.nanmean(deltas_arr))
# For each ensemble size, evaluate one deterministic draw (full set) or
# `samples_per_size` random subsets, and accumulate accuracy, confusion
# counts, and the confidence-based metrics.
# NOTE: the two branches of the original code duplicated ~40 lines verbatim;
# they are unified here — only the choice of model indices differs, so RNG
# consumption order (and therefore all downstream results) is unchanged.
for k in ensemble_sizes:
    # Per-size accumulators, one entry per draw.
    accuracies_k: list[float] = []
    true_positives_k: list[int] = []
    false_positives_k: list[int] = []
    true_negatives_k: list[int] = []
    false_negatives_k: list[int] = []
    confidence_scores_k: list[float] = []
    correct_confidences_k: list[float] = []
    incorrect_confidences_k: list[float] = []
    confidence_deltas_per_model_k: list[float] = []
    # If using the full set, evaluate once deterministically.
    if k == num_models:
        draws = [np.arange(num_models)]
    else:
        draws = [
            rng.choice(num_models, size=k, replace=False)
            for _ in range(samples_per_size)
        ]
    for selected_idx in draws:
        per_model_preds_selected = predictions.isel(model=selected_idx)
        # Ensemble prediction = mean confidence across the selected models.
        preds_selected = per_model_preds_selected.mean(dim="model")
        confs = preds_selected.sel(img_class=1).values
        predicted_positive = confs >= 0.5
        true_positive = true_labels == 1
        confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
        mean_correct_conf, mean_incorrect_conf = (
            average_predicted_class_confidence_by_correctness(confs, true_labels)
        )
        correct_confidences_k.append(mean_correct_conf)
        incorrect_confidences_k.append(mean_incorrect_conf)
        confidence_deltas_per_model_k.append(
            average_confidence_delta_per_model(per_model_preds_selected, true_labels)
        )
        # Confusion-matrix counts for this draw.
        tp = int((predicted_positive & true_positive).sum().item())
        fp = int((predicted_positive & ~true_positive).sum().item())
        tn = int((~predicted_positive & ~true_positive).sum().item())
        fn = int((~predicted_positive & true_positive).sum().item())
        true_positives_k.append(tp)
        false_positives_k.append(fp)
        true_negatives_k.append(tn)
        false_negatives_k.append(fn)
        acc = (predicted_positive == true_positive).sum().item() / len(confs)
        accuracies_k.append(acc)
    # Store the raw per-draw values and the summary statistics for size k.
    all_accuracies[k] = accuracies_k
    all_true_positives[k] = true_positives_k
    all_false_positives[k] = false_positives_k
    all_true_negatives[k] = true_negatives_k
    all_false_negatives[k] = false_negatives_k
    confidence_accuracy_scores[k] = confidence_scores_k
    mean_confidence_accuracy_scores.append(float(np.mean(confidence_scores_k)))
    std_confidence_accuracy_scores.append(float(np.std(confidence_scores_k, ddof=0)))
    correct_prediction_confidences[k] = correct_confidences_k
    incorrect_prediction_confidences[k] = incorrect_confidences_k
    # nan-aware statistics: a draw with no correct (or no incorrect)
    # predictions contributes nan and is skipped.
    mean_correct_prediction_confidences.append(
        float(np.nanmean(np.asarray(correct_confidences_k, dtype=float)))
    )
    std_correct_prediction_confidences.append(
        float(np.nanstd(np.asarray(correct_confidences_k, dtype=float), ddof=0))
    )
    mean_incorrect_prediction_confidences.append(
        float(np.nanmean(np.asarray(incorrect_confidences_k, dtype=float)))
    )
    std_incorrect_prediction_confidences.append(
        float(np.nanstd(np.asarray(incorrect_confidences_k, dtype=float), ddof=0))
    )
    confidence_delta_per_model[k] = confidence_deltas_per_model_k
    mean_confidence_delta_per_model.append(
        float(np.nanmean(np.asarray(confidence_deltas_per_model_k, dtype=float)))
    )
    std_confidence_delta_per_model.append(
        float(np.nanstd(np.asarray(confidence_deltas_per_model_k, dtype=float), ddof=0))
    )
    mean_accuracies.append(float(np.mean(accuracies_k)))
    std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
# Derive F1 scores per ensemble size from the stored confusion counts:
# F1 = 2*TP / (2*TP + FP + FN), defined as 0 when the denominator is 0.
mean_f1s: list[float] = []
std_f1s: list[float] = []
all_f1s: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
for size in ensemble_sizes:
    tp = np.asarray(all_true_positives[size], dtype=float)
    fp = np.asarray(all_false_positives[size], dtype=float)
    fn = np.asarray(all_false_negatives[size], dtype=float)
    denominator = 2 * tp + fp + fn
    # Guarded divide: entries with an empty confusion denominator become 0.
    f1_values = np.divide(
        2 * tp,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator != 0,
    )
    scores = [float(v) for v in f1_values.tolist()]
    all_f1s[size] = scores
    mean_f1s.append(float(np.mean(scores)))
    std_f1s.append(float(np.std(scores, ddof=0)))
# Plot mean accuracy vs ensemble size with error bars (std).
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_accuracies,
    yerr=std_accuracies,
    fmt="-o",
    capsize=3,
    color="tab:blue",
    ecolor="tab:blue",
)
plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Accuracy")
plt.grid(True)
# Set x-ticks every 5 models (and always include the final model count).
# NOTE: `ticks` is reused by the later plots in this script.
ticks = list(range(1, num_models + 1, 5))
if not ticks or ticks[-1] != num_models:
    ticks.append(num_models)
plt.xticks(ticks)
# Overlay raw sample distributions as jittered points.
for k in ensemble_sizes:
    y = all_accuracies[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
# Close the figure so pyplot's global registry doesn't accumulate open figures.
plt.close()
# Plot mean F1 vs ensemble size with error bars (std).
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_f1s,
    yerr=std_f1s,
    fmt="-o",
    capsize=3,
    color="tab:orange",
    ecolor="tab:orange",
)
plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("F1 Score")
plt.grid(True)
plt.xticks(ticks)
# Overlay raw sample distributions as jittered points.
for k in ensemble_sizes:
    y = all_f1s[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")
# Close the figure so pyplot's global registry doesn't accumulate open figures.
plt.close()
# Mean confidence accuracy score vs ensemble size (error bars = std).
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_confidence_accuracy_scores,
    yerr=std_confidence_accuracy_scores,
    fmt="-o",
    capsize=3,
    color="tab:purple",
    ecolor="tab:purple",
)
plt.title("Sensitivity Analysis: Confidence Accuracy Score vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Confidence Accuracy Score (per image)")
plt.grid(True)
plt.xticks(ticks)
# Scatter the individual draws with small horizontal jitter so their spread
# around the mean is visible.
for size in ensemble_sizes:
    scores = confidence_accuracy_scores[size]
    jitter = (rng.random(len(scores)) - 0.5) * 0.2
    plt.scatter(
        np.full(len(scores), size) + jitter,
        scores,
        alpha=0.35,
        s=10,
        color="lightgray",
    )
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_confidence_accuracy_vs_ensemble_size.png")
# Avg predicted-class confidence, correct vs incorrect predictions, with std bars.
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_correct_prediction_confidences,
    yerr=std_correct_prediction_confidences,
    fmt="-o",
    capsize=3,
    color="tab:green",
    label="Correct predictions",
)
plt.errorbar(
    ensemble_sizes,
    mean_incorrect_prediction_confidences,
    yerr=std_incorrect_prediction_confidences,
    fmt="-o",
    capsize=3,
    color="tab:red",
    label="Incorrect predictions",
)
plt.title("Sensitivity Analysis: Avg Predicted-Class Confidence (Correct vs Incorrect)")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Average Predicted-Class Confidence")
plt.grid(True)
plt.xticks(ticks)
# Overlay the raw per-draw values: the full ensemble has a single draw, every
# other size has `samples_per_size` draws. Jitter x slightly for visibility.
for k in ensemble_sizes:
    n_draws = 1 if k == num_models else samples_per_size
    x = np.full(n_draws, k) + (rng.random(n_draws) - 0.5) * 0.2
    y_correct = np.asarray(correct_prediction_confidences[k], dtype=float)
    y_incorrect = np.asarray(incorrect_prediction_confidences[k], dtype=float)
    if y_correct.size:
        plt.scatter(
            x[: y_correct.size],
            y_correct,
            alpha=0.45,
            s=14,
            color="lightgray",
            marker="o",
        )
    if y_incorrect.size:
        plt.scatter(
            x[: y_incorrect.size],
            y_incorrect,
            alpha=0.45,
            s=18,
            color="dimgray",
            marker="x",
        )
plt.legend()
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_avg_confidence_correct_vs_incorrect.png")
# Per-model confidence delta (correct - incorrect) vs ensemble size, with std bars.
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_confidence_delta_per_model,
    yerr=std_confidence_delta_per_model,
    fmt="-o",
    capsize=3,
    color="tab:blue",
)
plt.title("Sensitivity Analysis: Confidence Delta Per Model (Correct - Incorrect)")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Avg Delta Per Model")
plt.grid(True)
plt.xticks(ticks)
# Jittered raw per-draw deltas behind the summary curve.
for size in ensemble_sizes:
    values = confidence_delta_per_model[size]
    jitter = (rng.random(len(values)) - 0.5) * 0.2
    plt.scatter(
        np.full(len(values), size) + jitter,
        values,
        alpha=0.35,
        s=12,
        color="lightgray",
    )
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_confidence_delta_per_model.png")
- # End of Copilot section
|