# The purpose of this file is to perform a sensitivity analysis on the model
# evaluation results and graph the findings.
# The sensitivity analysis is done by varying the number of models used in the
# ensemble and observing the effect on overall accuracy.
# We take 50 different random subsets of models for each ensemble size (except the
# full set, which is evaluated once deterministically) to get a distribution of
# accuracies for each ensemble size.
# Each ensemble size then gets error bars based on the standard deviation of its
# accuracies.

import xarray as xr
from utils.config import config
import pathlib as pl
import numpy as np
import matplotlib.pyplot as plt
import os

# Load the evaluation results
os.chdir(pl.Path(__file__).parent)
model_dataset_path = pl.Path("../model_evaluations") / pl.Path(
    config["analysis"]["evaluation_name"].strip()
).with_suffix(".nc")
print(f"Loading evaluation results from {model_dataset_path}")
array = xr.open_dataset(model_dataset_path)  # type: ignore

# This section was generated by GitHub Copilot - 2025-11-04
# Perform sensitivity analysis by varying ensemble size and sampling subsets of models.
predictions: xr.DataArray = array["predictions"]
labels: xr.DataArray = array["labels"]

# Make plots directory if it doesn't exist (matching other scripts)
plots_dir = (
    pl.Path("../output") / pl.Path(config["analysis"]["evaluation_name"]) / "plots"
)
plots_dir.mkdir(parents=True, exist_ok=True)

# Configuration for the sensitivity analysis
rng = np.random.default_rng(777)
num_models = int(predictions.sizes["model"])
ensemble_sizes = list(range(1, num_models + 1))
samples_per_size = 50

# Extract true labels for the positive class (assumes same structure as other script)
true_labels = labels.sel(label=1).values  # shape: (n_samples,)

# Containers for results
mean_accuracies: list[float] = []
std_accuracies: list[float] = []
all_accuracies: dict[int, list[float]] = {k: [] for k in ensemble_sizes}

# Confusion-matrix counts per ensemble size (one entry per sample draw)
all_true_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
all_false_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}

# Also implement a "confidence accuracy score":
# sum over samples of (confidence_of_predicted_class - 0.5) * (+1 if correct else -1)
confidence_accuracy_scores: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
mean_confidence_accuracy_scores: list[float] = []
std_confidence_accuracy_scores: list[float] = []

# Track average confidence of predicted class for correct vs incorrect predictions
correct_prediction_confidences: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
incorrect_prediction_confidences: dict[int, list[float]] = {
    k: [] for k in ensemble_sizes
}
mean_correct_prediction_confidences: list[float] = []
std_correct_prediction_confidences: list[float] = []
mean_incorrect_prediction_confidences: list[float] = []
std_incorrect_prediction_confidences: list[float] = []

# Track delta (correct - incorrect) confidence per model, averaged over models
confidence_delta_per_model: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
mean_confidence_delta_per_model: list[float] = []
std_confidence_delta_per_model: list[float] = []


def confidence_accuracy_score(
    positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
) -> float:
    """Compute the confidence accuracy score described above.

    The score sums (predicted-class confidence - 0.5) * (+1 if correct else -1)
    over all samples and normalizes by 0.5 * n_samples, so it lies in [-1, 1].

    Parameters
    ----------
    positive_class_confidence:
        Probability/confidence assigned to the positive class (class 1),
        shape (n_samples,).
    true_labels_binary:
        Ground-truth binary labels in {0, 1}, shape (n_samples,).
    """
    confs = np.asarray(positive_class_confidence, dtype=float)
    y = np.asarray(true_labels_binary, dtype=int)
    if confs.shape[0] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: confs has {confs.shape[0]} samples, labels has {y.shape[0]}"
        )
    predicted_positive = confs >= 0.5
    true_positive = y == 1
    correct = predicted_positive == true_positive
    # Confidence of the predicted class (so correct negatives contribute positively)
    predicted_class_confidence = np.where(predicted_positive, confs, 1.0 - confs)
    sign = np.where(correct, 1.0, -1.0)
    # Per-image average score; dividing by 0.5 * n_samples scales it to [-1, 1]
    return float(
        np.sum((predicted_class_confidence - 0.5) * sign) / (0.5 * confs.shape[0])
    )

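# Illustrative sanity check for confidence_accuracy_score (toy numbers, not taken
# from the evaluation data): with positive-class confidences [0.9, 0.2] and labels
# [1, 1], sample 1 is a correct positive (predicted-class confidence 0.9) and
# sample 2 is an incorrect negative (predicted-class confidence 0.8), so
#   confidence_accuracy_score(np.array([0.9, 0.2]), np.array([1, 1]))
# evaluates to ((0.9 - 0.5) * +1 + (0.8 - 0.5) * -1) / (0.5 * 2) = 0.1.
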
def average_predicted_class_confidence_by_correctness(
    positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
) -> tuple[float, float]:
    """Return (mean_conf_correct, mean_conf_incorrect) for predicted-class confidence.

    Uses the confidence of the predicted class for each sample:
    - if predicted positive: confidence = P(class=1)
    - if predicted negative: confidence = P(class=0) = 1 - P(class=1)

    Example (toy values): confidences [0.9, 0.2, 0.6] with labels [1, 1, 0] give
    predicted-class confidences [0.9, 0.8, 0.6]; only the first sample is correct,
    so the result is (0.9, 0.7).
    """
    confs = np.asarray(positive_class_confidence, dtype=float)
    y = np.asarray(true_labels_binary, dtype=int)
    if confs.shape[0] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: confs has {confs.shape[0]} samples, labels has {y.shape[0]}"
        )
    predicted_positive = confs >= 0.5
    true_positive = y == 1
    correct = predicted_positive == true_positive
    predicted_class_confidence = np.where(predicted_positive, confs, 1.0 - confs)
    correct_confs = predicted_class_confidence[correct]
    incorrect_confs = predicted_class_confidence[~correct]
    mean_correct = float(np.mean(correct_confs)) if correct_confs.size else float("nan")
    mean_incorrect = (
        float(np.mean(incorrect_confs)) if incorrect_confs.size else float("nan")
    )
    return mean_correct, mean_incorrect


def average_confidence_delta_per_model(
    predictions_selected: xr.DataArray, true_labels_binary: np.ndarray
) -> float:
    """Average (correct - incorrect) predicted-class confidence per model.

    For each model independently:
    1) compute predicted-class confidence per sample
    2) split into correct vs incorrect samples
    3) delta = mean(correct) - mean(incorrect)

    Returns the mean delta across models (nan-safe). Example (toy values): a single
    model with confidences [0.9, 0.2, 0.6] and labels [1, 1, 0] has a correct-sample
    mean of 0.9 and an incorrect-sample mean of 0.7, so the delta is 0.2.
    """
    per_model_confs = predictions_selected.sel(img_class=1).values  # (k_models, n)
    y = np.asarray(true_labels_binary, dtype=int)
    if per_model_confs.shape[1] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: predictions have {per_model_confs.shape[1]} samples, labels has {y.shape[0]}"
        )
    # For each model, determine predicted class and confidence of predicted class
    predicted_positive = per_model_confs >= 0.5
    true_positive = y == 1
    correct = predicted_positive == true_positive  # broadcasts to (k_models, n)
    predicted_class_confidence = np.where(
        predicted_positive, per_model_confs, 1.0 - per_model_confs
    )
    # Compute per-model means for correct/incorrect, then delta
    deltas: list[float] = []
    for m in range(predicted_class_confidence.shape[0]):
        conf_m = predicted_class_confidence[m]
        correct_m = correct[m]
        correct_vals = conf_m[correct_m]
        incorrect_vals = conf_m[~correct_m]
        if correct_vals.size == 0 or incorrect_vals.size == 0:
            deltas.append(float("nan"))
            continue
        deltas.append(float(np.mean(correct_vals) - np.mean(incorrect_vals)))
    return float(np.nanmean(np.asarray(deltas, dtype=float)))


for k in ensemble_sizes:
    accuracies_k = []
    true_positives_k: list[int] = []
    false_positives_k: list[int] = []
    true_negatives_k: list[int] = []
    false_negatives_k: list[int] = []
    confidence_scores_k: list[float] = []
    correct_confidences_k: list[float] = []
    incorrect_confidences_k: list[float] = []
    confidence_deltas_per_model_k: list[float] = []

    # If using the full set, evaluate once deterministically
    if k == num_models:
        selected_idx = np.arange(num_models)
        preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
        per_model_preds_selected = predictions.isel(model=selected_idx)
        confs = preds_selected.sel(img_class=1).values
        predicted_positive = confs >= 0.5
        true_positive = true_labels == 1
        confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
        mean_correct_conf, mean_incorrect_conf = (
            average_predicted_class_confidence_by_correctness(confs, true_labels)
        )
        correct_confidences_k.append(mean_correct_conf)
        incorrect_confidences_k.append(mean_incorrect_conf)
        confidence_deltas_per_model_k.append(
            average_confidence_delta_per_model(per_model_preds_selected, true_labels)
        )
        tp = int((predicted_positive & true_positive).sum().item())
        fp = int((predicted_positive & ~true_positive).sum().item())
        tn = int((~predicted_positive & ~true_positive).sum().item())
        fn = int((~predicted_positive & true_positive).sum().item())
        true_positives_k.append(tp)
        false_positives_k.append(fp)
        true_negatives_k.append(tn)
        false_negatives_k.append(fn)
        acc = (predicted_positive == true_positive).sum().item() / len(confs)
        accuracies_k.append(acc)
    else:
        for _ in range(samples_per_size):
            selected_idx = rng.choice(num_models, size=k, replace=False)
            preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
            per_model_preds_selected = predictions.isel(model=selected_idx)
            confs = preds_selected.sel(img_class=1).values
            predicted_positive = confs >= 0.5
            true_positive = true_labels == 1
            confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
            mean_correct_conf, mean_incorrect_conf = (
                average_predicted_class_confidence_by_correctness(confs, true_labels)
            )
            correct_confidences_k.append(mean_correct_conf)
            incorrect_confidences_k.append(mean_incorrect_conf)
            confidence_deltas_per_model_k.append(
                average_confidence_delta_per_model(
                    per_model_preds_selected, true_labels
                )
            )
            tp = int((predicted_positive & true_positive).sum().item())
            fp = int((predicted_positive & ~true_positive).sum().item())
            tn = int((~predicted_positive & ~true_positive).sum().item())
            fn = int((~predicted_positive & true_positive).sum().item())
            true_positives_k.append(tp)
            false_positives_k.append(fp)
            true_negatives_k.append(tn)
            false_negatives_k.append(fn)
            acc = (predicted_positive == true_positive).sum().item() / len(confs)
            accuracies_k.append(acc)

    all_accuracies[k] = accuracies_k
    all_true_positives[k] = true_positives_k
    all_false_positives[k] = false_positives_k
    all_true_negatives[k] = true_negatives_k
    all_false_negatives[k] = false_negatives_k

    confidence_accuracy_scores[k] = confidence_scores_k
    mean_confidence_accuracy_scores.append(float(np.mean(confidence_scores_k)))
    std_confidence_accuracy_scores.append(float(np.std(confidence_scores_k, ddof=0)))

    correct_prediction_confidences[k] = correct_confidences_k
    incorrect_prediction_confidences[k] = incorrect_confidences_k
    mean_correct_prediction_confidences.append(
        float(np.nanmean(np.asarray(correct_confidences_k, dtype=float)))
    )
    std_correct_prediction_confidences.append(
        float(np.nanstd(np.asarray(correct_confidences_k, dtype=float), ddof=0))
    )
    mean_incorrect_prediction_confidences.append(
        float(np.nanmean(np.asarray(incorrect_confidences_k, dtype=float)))
    )
    std_incorrect_prediction_confidences.append(
        float(np.nanstd(np.asarray(incorrect_confidences_k, dtype=float), ddof=0))
    )

    confidence_delta_per_model[k] = confidence_deltas_per_model_k
    mean_confidence_delta_per_model.append(
        float(np.nanmean(np.asarray(confidence_deltas_per_model_k, dtype=float)))
    )
    std_confidence_delta_per_model.append(
        float(np.nanstd(np.asarray(confidence_deltas_per_model_k, dtype=float), ddof=0))
    )

    mean_accuracies.append(float(np.mean(accuracies_k)))
    std_accuracies.append(float(np.std(accuracies_k, ddof=0)))

# Compute F1 scores per ensemble size from stored confusion counts
mean_f1s: list[float] = []
std_f1s: list[float] = []
all_f1s: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
for k in ensemble_sizes:
    tp_arr = np.asarray(all_true_positives[k], dtype=float)
    fp_arr = np.asarray(all_false_positives[k], dtype=float)
    fn_arr = np.asarray(all_false_negatives[k], dtype=float)
    denom = 2 * tp_arr + fp_arr + fn_arr
    f1_arr = np.divide(
        2 * tp_arr,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom != 0,
    )
    f1s_k = [float(x) for x in f1_arr.tolist()]
    all_f1s[k] = f1s_k
    mean_f1s.append(float(np.mean(f1s_k)))
    std_f1s.append(float(np.std(f1s_k, ddof=0)))

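# Quick check of the F1 identity used above (illustrative counts, not real results):
# tp=8, fp=2, fn=4 gives F1 = 2 * 8 / (2 * 8 + 2 + 4) = 16 / 22 ≈ 0.727, which
# matches 2 * precision * recall / (precision + recall) with precision = 0.8 and
# recall = 2/3.
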
# Plot mean accuracy vs ensemble size with error bars (std)
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_accuracies,
    yerr=std_accuracies,
    fmt="-o",
    capsize=3,
    color="tab:blue",
    ecolor="tab:blue",
)
plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Accuracy")
plt.grid(True)
# Set x-ticks every 5 models (and always include the final model count)
ticks = list(range(1, num_models + 1, 5))
if len(ticks) == 0 or ticks[-1] != num_models:
    ticks.append(num_models)
plt.xticks(ticks)
# Overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    y = all_accuracies[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")

# Plot mean F1 vs ensemble size with error bars (std)
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_f1s,
    yerr=std_f1s,
    fmt="-o",
    capsize=3,
    color="tab:orange",
    ecolor="tab:orange",
)
plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("F1 Score")
plt.grid(True)
plt.xticks(ticks)
# Overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    y = all_f1s[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")

# Plot mean confidence accuracy score vs ensemble size with error bars (std)
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_confidence_accuracy_scores,
    yerr=std_confidence_accuracy_scores,
    fmt="-o",
    capsize=3,
    color="tab:purple",
    ecolor="tab:purple",
)
plt.title("Sensitivity Analysis: Confidence Accuracy Score vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Confidence Accuracy Score (per image)")
plt.grid(True)
plt.xticks(ticks)
# Overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    y = confidence_accuracy_scores[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_confidence_accuracy_vs_ensemble_size.png")

# Plot mean predicted-class confidence for correct vs incorrect predictions
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_correct_prediction_confidences,
    yerr=std_correct_prediction_confidences,
    fmt="-o",
    capsize=3,
    color="tab:green",
    label="Correct predictions",
)
plt.errorbar(
    ensemble_sizes,
    mean_incorrect_prediction_confidences,
    yerr=std_incorrect_prediction_confidences,
    fmt="-o",
    capsize=3,
    color="tab:red",
    label="Incorrect predictions",
)
plt.title("Sensitivity Analysis: Avg Predicted-Class Confidence (Correct vs Incorrect)")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Average Predicted-Class Confidence")
plt.grid(True)
plt.xticks(ticks)
# Overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    x = np.full(samples_per_size if k != num_models else 1, k) + (
        (rng.random(samples_per_size if k != num_models else 1) - 0.5) * 0.2
    )
    y_correct = np.asarray(correct_prediction_confidences[k], dtype=float)
    y_incorrect = np.asarray(incorrect_prediction_confidences[k], dtype=float)
    if y_correct.size:
        plt.scatter(
            x[: y_correct.size],
            y_correct,
            alpha=0.45,
            s=14,
            color="lightgray",
            marker="o",
        )
    if y_incorrect.size:
        plt.scatter(
            x[: y_incorrect.size],
            y_incorrect,
            alpha=0.45,
            s=18,
            color="dimgray",
            marker="x",
        )
plt.legend()
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_avg_confidence_correct_vs_incorrect.png")

# Plot mean confidence delta per model (correct - incorrect) vs ensemble size
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_confidence_delta_per_model,
    yerr=std_confidence_delta_per_model,
    fmt="-o",
    capsize=3,
    color="tab:blue",
)
plt.title("Sensitivity Analysis: Confidence Delta Per Model (Correct - Incorrect)")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Avg Delta Per Model")
plt.grid(True)
plt.xticks(ticks)
for k in ensemble_sizes:
    y = confidence_delta_per_model[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2
    plt.scatter(x, y, alpha=0.35, s=12, color="lightgray")
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_confidence_delta_per_model.png")
# End of Copilot section
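
# Note on the rightmost point of each curve above: the full ensemble is evaluated
# only once (deterministically), so its per-size "distribution" is a single value
# and its error bar (population std with ddof=0) is zero by construction.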