|
@@ -32,7 +32,7 @@ plots_dir = (
|
|
|
plots_dir.mkdir(parents=True, exist_ok=True)
|
|
plots_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
# Configuration for the sensitivity analysis
|
|
# Configuration for the sensitivity analysis
|
|
|
-rng = np.random.default_rng(42)
|
|
|
|
|
|
|
+rng = np.random.default_rng(777)
|
|
|
num_models = int(predictions.sizes["model"])
|
|
num_models = int(predictions.sizes["model"])
|
|
|
ensemble_sizes = list(range(1, num_models + 1))
|
|
ensemble_sizes = list(range(1, num_models + 1))
|
|
|
samples_per_size = 50
|
|
samples_per_size = 50
|
|
@@ -51,21 +51,166 @@ all_false_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
|
|
|
all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
|
|
all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
|
|
|
all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
|
|
all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
|
|
|
|
|
|
|
|
|
|
+# Also implement a "confidence accuracy score":
|
|
|
|
|
+# sum over samples of (confidence_of_predicted_class - 0.5) * (+1 if correct else -1)
|
|
|
|
|
+confidence_accuracy_scores: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
|
|
|
|
|
+mean_confidence_accuracy_scores: list[float] = []
|
|
|
|
|
+std_confidence_accuracy_scores: list[float] = []
|
|
|
|
|
+
|
|
|
|
|
+# Track average confidence of predicted class for correct vs incorrect predictions
|
|
|
|
|
+correct_prediction_confidences: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
|
|
|
|
|
+incorrect_prediction_confidences: dict[int, list[float]] = {
|
|
|
|
|
+ k: [] for k in ensemble_sizes
|
|
|
|
|
+}
|
|
|
|
|
+mean_correct_prediction_confidences: list[float] = []
|
|
|
|
|
+std_correct_prediction_confidences: list[float] = []
|
|
|
|
|
+mean_incorrect_prediction_confidences: list[float] = []
|
|
|
|
|
+std_incorrect_prediction_confidences: list[float] = []
|
|
|
|
|
+
|
|
|
|
|
+# Track delta (correct - incorrect) confidence per model, averaged over models
|
|
|
|
|
+confidence_delta_per_model: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
|
|
|
|
|
+mean_confidence_delta_per_model: list[float] = []
|
|
|
|
|
+std_confidence_delta_per_model: list[float] = []
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def confidence_accuracy_score(
|
|
|
|
|
+ positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
|
|
|
|
|
+) -> float:
|
|
|
|
|
+ """Compute confidence accuracy score as described.
|
|
|
|
|
+
|
|
|
|
|
+ Parameters
|
|
|
|
|
+ ----------
|
|
|
|
|
+ positive_class_confidence:
|
|
|
|
|
+ Probability/confidence assigned to the positive class (class 1), shape (n_samples,).
|
|
|
|
|
+ true_labels_binary:
|
|
|
|
|
+ Ground-truth binary labels in {0, 1}, shape (n_samples,).
|
|
|
|
|
+ """
|
|
|
|
|
+
|
|
|
|
|
+ confs = np.asarray(positive_class_confidence, dtype=float)
|
|
|
|
|
+ y = np.asarray(true_labels_binary, dtype=int)
|
|
|
|
|
+ if confs.shape[0] != y.shape[0]:
|
|
|
|
|
+ raise ValueError(
|
|
|
|
|
+ f"Mismatched lengths: confs has {confs.shape[0]} samples, labels has {y.shape[0]}"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ predicted_positive = confs >= 0.5
|
|
|
|
|
+ true_positive = y == 1
|
|
|
|
|
+ correct = predicted_positive == true_positive
|
|
|
|
|
+
|
|
|
|
|
+ # Confidence of the predicted class (so correct negatives contribute positively)
|
|
|
|
|
+ predicted_class_confidence = np.where(predicted_positive, confs, 1.0 - confs)
|
|
|
|
|
+
|
|
|
|
|
+ sign = np.where(correct, 1.0, -1.0)
|
|
|
|
|
+ # Normalize by number of evaluated images (per-image average score)
|
|
|
|
|
+ # To normalize score to [0, 1], divide by 0.5 * n_samples
|
|
|
|
|
+ return float(
|
|
|
|
|
+ np.sum((predicted_class_confidence - 0.5) * sign) / (0.5 * confs.shape[0])
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def average_predicted_class_confidence_by_correctness(
|
|
|
|
|
+ positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
|
|
|
|
|
+) -> tuple[float, float]:
|
|
|
|
|
+ """Return (mean_conf_correct, mean_conf_incorrect) for predicted class confidence.
|
|
|
|
|
+
|
|
|
|
|
+ Uses confidence of the predicted class for each sample:
|
|
|
|
|
+ - if predicted positive: confidence = P(class=1)
|
|
|
|
|
+ - if predicted negative: confidence = P(class=0) = 1 - P(class=1)
|
|
|
|
|
+ """
|
|
|
|
|
+
|
|
|
|
|
+ confs = np.asarray(positive_class_confidence, dtype=float)
|
|
|
|
|
+ y = np.asarray(true_labels_binary, dtype=int)
|
|
|
|
|
+ if confs.shape[0] != y.shape[0]:
|
|
|
|
|
+ raise ValueError(
|
|
|
|
|
+ f"Mismatched lengths: confs has {confs.shape[0]} samples, labels has {y.shape[0]}"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ predicted_positive = confs >= 0.5
|
|
|
|
|
+ true_positive = y == 1
|
|
|
|
|
+ correct = predicted_positive == true_positive
|
|
|
|
|
+ predicted_class_confidence = np.where(predicted_positive, confs, 1.0 - confs)
|
|
|
|
|
+
|
|
|
|
|
+ correct_confs = predicted_class_confidence[correct]
|
|
|
|
|
+ incorrect_confs = predicted_class_confidence[~correct]
|
|
|
|
|
+
|
|
|
|
|
+ mean_correct = float(np.mean(correct_confs)) if correct_confs.size else float("nan")
|
|
|
|
|
+ mean_incorrect = (
|
|
|
|
|
+ float(np.mean(incorrect_confs)) if incorrect_confs.size else float("nan")
|
|
|
|
|
+ )
|
|
|
|
|
+ return mean_correct, mean_incorrect
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def average_confidence_delta_per_model(
|
|
|
|
|
+ predictions_selected: xr.DataArray, true_labels_binary: np.ndarray
|
|
|
|
|
+) -> float:
|
|
|
|
|
+ """Average (correct - incorrect) predicted-class confidence per model.
|
|
|
|
|
+
|
|
|
|
|
+ For each model independently:
|
|
|
|
|
+ 1) compute predicted-class confidence per sample
|
|
|
|
|
+ 2) split into correct vs incorrect samples
|
|
|
|
|
+ 3) delta = mean(correct) - mean(incorrect)
|
|
|
|
|
+ Returns the mean delta across models (nan-safe).
|
|
|
|
|
+ """
|
|
|
|
|
+
|
|
|
|
|
+ per_model_confs = predictions_selected.sel(img_class=1).values # (k_models, n)
|
|
|
|
|
+ y = np.asarray(true_labels_binary, dtype=int)
|
|
|
|
|
+ if per_model_confs.shape[1] != y.shape[0]:
|
|
|
|
|
+ raise ValueError(
|
|
|
|
|
+ f"Mismatched lengths: predictions have {per_model_confs.shape[1]} samples, labels has {y.shape[0]}"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # For each model, determine predicted class and confidence of predicted class
|
|
|
|
|
+ predicted_positive = per_model_confs >= 0.5
|
|
|
|
|
+ true_positive = y == 1
|
|
|
|
|
+ correct = predicted_positive == true_positive # broadcasts to (k_models, n)
|
|
|
|
|
+ predicted_class_confidence = np.where(
|
|
|
|
|
+ predicted_positive, per_model_confs, 1.0 - per_model_confs
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # Compute per-model means for correct/incorrect, then delta
|
|
|
|
|
+ deltas: list[float] = []
|
|
|
|
|
+ for m in range(predicted_class_confidence.shape[0]):
|
|
|
|
|
+ conf_m = predicted_class_confidence[m]
|
|
|
|
|
+ correct_m = correct[m]
|
|
|
|
|
+ correct_vals = conf_m[correct_m]
|
|
|
|
|
+ incorrect_vals = conf_m[~correct_m]
|
|
|
|
|
+ if correct_vals.size == 0 or incorrect_vals.size == 0:
|
|
|
|
|
+ deltas.append(float("nan"))
|
|
|
|
|
+ continue
|
|
|
|
|
+ deltas.append(float(np.mean(correct_vals) - np.mean(incorrect_vals)))
|
|
|
|
|
+
|
|
|
|
|
+ return float(np.nanmean(np.asarray(deltas, dtype=float)))
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
for k in ensemble_sizes:
|
|
for k in ensemble_sizes:
|
|
|
accuracies_k = []
|
|
accuracies_k = []
|
|
|
true_positives_k: list[int] = []
|
|
true_positives_k: list[int] = []
|
|
|
false_positives_k: list[int] = []
|
|
false_positives_k: list[int] = []
|
|
|
true_negatives_k: list[int] = []
|
|
true_negatives_k: list[int] = []
|
|
|
false_negatives_k: list[int] = []
|
|
false_negatives_k: list[int] = []
|
|
|
-
|
|
|
|
|
|
|
+ confidence_scores_k: list[float] = []
|
|
|
|
|
+ correct_confidences_k: list[float] = []
|
|
|
|
|
+ incorrect_confidences_k: list[float] = []
|
|
|
|
|
+ confidence_deltas_per_model_k: list[float] = []
|
|
|
# If using the full set, evaluate once deterministically
|
|
# If using the full set, evaluate once deterministically
|
|
|
if k == num_models:
|
|
if k == num_models:
|
|
|
selected_idx = np.arange(num_models)
|
|
selected_idx = np.arange(num_models)
|
|
|
preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
|
|
preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
|
|
|
|
|
+ per_model_preds_selected = predictions.isel(model=selected_idx)
|
|
|
confs = preds_selected.sel(img_class=1).values
|
|
confs = preds_selected.sel(img_class=1).values
|
|
|
predicted_positive = confs >= 0.5
|
|
predicted_positive = confs >= 0.5
|
|
|
true_positive = true_labels == 1
|
|
true_positive = true_labels == 1
|
|
|
|
|
|
|
|
|
|
+ confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
|
|
|
|
|
+ mean_correct_conf, mean_incorrect_conf = (
|
|
|
|
|
+ average_predicted_class_confidence_by_correctness(confs, true_labels)
|
|
|
|
|
+ )
|
|
|
|
|
+ correct_confidences_k.append(mean_correct_conf)
|
|
|
|
|
+ incorrect_confidences_k.append(mean_incorrect_conf)
|
|
|
|
|
+ confidence_deltas_per_model_k.append(
|
|
|
|
|
+ average_confidence_delta_per_model(per_model_preds_selected, true_labels)
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
tp = int((predicted_positive & true_positive).sum().item())
|
|
tp = int((predicted_positive & true_positive).sum().item())
|
|
|
fp = int((predicted_positive & ~true_positive).sum().item())
|
|
fp = int((predicted_positive & ~true_positive).sum().item())
|
|
|
tn = int((~predicted_positive & ~true_positive).sum().item())
|
|
tn = int((~predicted_positive & ~true_positive).sum().item())
|
|
@@ -81,10 +226,23 @@ for k in ensemble_sizes:
|
|
|
for _ in range(samples_per_size):
|
|
for _ in range(samples_per_size):
|
|
|
selected_idx = rng.choice(num_models, size=k, replace=False)
|
|
selected_idx = rng.choice(num_models, size=k, replace=False)
|
|
|
preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
|
|
preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
|
|
|
|
|
+ per_model_preds_selected = predictions.isel(model=selected_idx)
|
|
|
confs = preds_selected.sel(img_class=1).values
|
|
confs = preds_selected.sel(img_class=1).values
|
|
|
predicted_positive = confs >= 0.5
|
|
predicted_positive = confs >= 0.5
|
|
|
true_positive = true_labels == 1
|
|
true_positive = true_labels == 1
|
|
|
|
|
|
|
|
|
|
+ confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
|
|
|
|
|
+ mean_correct_conf, mean_incorrect_conf = (
|
|
|
|
|
+ average_predicted_class_confidence_by_correctness(confs, true_labels)
|
|
|
|
|
+ )
|
|
|
|
|
+ correct_confidences_k.append(mean_correct_conf)
|
|
|
|
|
+ incorrect_confidences_k.append(mean_incorrect_conf)
|
|
|
|
|
+ confidence_deltas_per_model_k.append(
|
|
|
|
|
+ average_confidence_delta_per_model(
|
|
|
|
|
+ per_model_preds_selected, true_labels
|
|
|
|
|
+ )
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
tp = int((predicted_positive & true_positive).sum().item())
|
|
tp = int((predicted_positive & true_positive).sum().item())
|
|
|
fp = int((predicted_positive & ~true_positive).sum().item())
|
|
fp = int((predicted_positive & ~true_positive).sum().item())
|
|
|
tn = int((~predicted_positive & ~true_positive).sum().item())
|
|
tn = int((~predicted_positive & ~true_positive).sum().item())
|
|
@@ -102,6 +260,32 @@ for k in ensemble_sizes:
|
|
|
all_false_positives[k] = false_positives_k
|
|
all_false_positives[k] = false_positives_k
|
|
|
all_true_negatives[k] = true_negatives_k
|
|
all_true_negatives[k] = true_negatives_k
|
|
|
all_false_negatives[k] = false_negatives_k
|
|
all_false_negatives[k] = false_negatives_k
|
|
|
|
|
+ confidence_accuracy_scores[k] = confidence_scores_k
|
|
|
|
|
+ mean_confidence_accuracy_scores.append(float(np.mean(confidence_scores_k)))
|
|
|
|
|
+ std_confidence_accuracy_scores.append(float(np.std(confidence_scores_k, ddof=0)))
|
|
|
|
|
+
|
|
|
|
|
+ correct_prediction_confidences[k] = correct_confidences_k
|
|
|
|
|
+ incorrect_prediction_confidences[k] = incorrect_confidences_k
|
|
|
|
|
+ mean_correct_prediction_confidences.append(
|
|
|
|
|
+ float(np.nanmean(np.asarray(correct_confidences_k, dtype=float)))
|
|
|
|
|
+ )
|
|
|
|
|
+ std_correct_prediction_confidences.append(
|
|
|
|
|
+ float(np.nanstd(np.asarray(correct_confidences_k, dtype=float), ddof=0))
|
|
|
|
|
+ )
|
|
|
|
|
+ mean_incorrect_prediction_confidences.append(
|
|
|
|
|
+ float(np.nanmean(np.asarray(incorrect_confidences_k, dtype=float)))
|
|
|
|
|
+ )
|
|
|
|
|
+ std_incorrect_prediction_confidences.append(
|
|
|
|
|
+ float(np.nanstd(np.asarray(incorrect_confidences_k, dtype=float), ddof=0))
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ confidence_delta_per_model[k] = confidence_deltas_per_model_k
|
|
|
|
|
+ mean_confidence_delta_per_model.append(
|
|
|
|
|
+ float(np.nanmean(np.asarray(confidence_deltas_per_model_k, dtype=float)))
|
|
|
|
|
+ )
|
|
|
|
|
+ std_confidence_delta_per_model.append(
|
|
|
|
|
+ float(np.nanstd(np.asarray(confidence_deltas_per_model_k, dtype=float), ddof=0))
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
mean_accuracies.append(float(np.mean(accuracies_k)))
|
|
mean_accuracies.append(float(np.mean(accuracies_k)))
|
|
|
std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
|
|
std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
|
|
@@ -129,7 +313,15 @@ for k in ensemble_sizes:
|
|
|
|
|
|
|
|
# Plot mean accuracy vs ensemble size with error bars (std)
|
|
# Plot mean accuracy vs ensemble size with error bars (std)
|
|
|
plt.figure(figsize=(10, 6))
|
|
plt.figure(figsize=(10, 6))
|
|
|
-plt.errorbar(ensemble_sizes, mean_accuracies, yerr=std_accuracies, fmt="-o", capsize=3)
|
|
|
|
|
|
|
+plt.errorbar(
|
|
|
|
|
+ ensemble_sizes,
|
|
|
|
|
+ mean_accuracies,
|
|
|
|
|
+ yerr=std_accuracies,
|
|
|
|
|
+ fmt="-o",
|
|
|
|
|
+ capsize=3,
|
|
|
|
|
+ color="tab:blue",
|
|
|
|
|
+ ecolor="tab:blue",
|
|
|
|
|
+)
|
|
|
plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
|
|
plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
|
|
|
plt.xlabel("Number of Models in Ensemble")
|
|
plt.xlabel("Number of Models in Ensemble")
|
|
|
plt.ylabel("Accuracy")
|
|
plt.ylabel("Accuracy")
|
|
@@ -144,7 +336,7 @@ plt.xticks(ticks)
|
|
|
for i, k in enumerate(ensemble_sizes):
|
|
for i, k in enumerate(ensemble_sizes):
|
|
|
y = all_accuracies[k]
|
|
y = all_accuracies[k]
|
|
|
x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2 # small jitter
|
|
x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2 # small jitter
|
|
|
- plt.scatter(x, y, alpha=0.3, s=8, color="gray")
|
|
|
|
|
|
|
+ plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
|
|
|
|
|
|
|
|
plt.tight_layout()
|
|
plt.tight_layout()
|
|
|
|
|
|
|
@@ -152,7 +344,15 @@ plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
|
|
|
|
|
|
|
|
# Plot mean F1 vs ensemble size with error bars (std)
|
|
# Plot mean F1 vs ensemble size with error bars (std)
|
|
|
plt.figure(figsize=(10, 6))
|
|
plt.figure(figsize=(10, 6))
|
|
|
-plt.errorbar(ensemble_sizes, mean_f1s, yerr=std_f1s, fmt="-o", capsize=3)
|
|
|
|
|
|
|
+plt.errorbar(
|
|
|
|
|
+ ensemble_sizes,
|
|
|
|
|
+ mean_f1s,
|
|
|
|
|
+ yerr=std_f1s,
|
|
|
|
|
+ fmt="-o",
|
|
|
|
|
+ capsize=3,
|
|
|
|
|
+ color="tab:orange",
|
|
|
|
|
+ ecolor="tab:orange",
|
|
|
|
|
+)
|
|
|
plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
|
|
plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
|
|
|
plt.xlabel("Number of Models in Ensemble")
|
|
plt.xlabel("Number of Models in Ensemble")
|
|
|
plt.ylabel("F1 Score")
|
|
plt.ylabel("F1 Score")
|
|
@@ -163,8 +363,114 @@ plt.xticks(ticks)
|
|
|
for i, k in enumerate(ensemble_sizes):
|
|
for i, k in enumerate(ensemble_sizes):
|
|
|
y = all_f1s[k]
|
|
y = all_f1s[k]
|
|
|
x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2 # small jitter
|
|
x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2 # small jitter
|
|
|
- plt.scatter(x, y, alpha=0.3, s=8, color="gray")
|
|
|
|
|
|
|
+ plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
|
|
|
|
|
|
|
|
plt.tight_layout()
|
|
plt.tight_layout()
|
|
|
plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")
|
|
plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")
|
|
|
|
|
+
|
|
|
|
|
+# Plot mean confidence accuracy score vs ensemble size with error bars (std)
|
|
|
|
|
+plt.figure(figsize=(10, 6))
|
|
|
|
|
+plt.errorbar(
|
|
|
|
|
+ ensemble_sizes,
|
|
|
|
|
+ mean_confidence_accuracy_scores,
|
|
|
|
|
+ yerr=std_confidence_accuracy_scores,
|
|
|
|
|
+ fmt="-o",
|
|
|
|
|
+ capsize=3,
|
|
|
|
|
+ color="tab:purple",
|
|
|
|
|
+ ecolor="tab:purple",
|
|
|
|
|
+)
|
|
|
|
|
+plt.title("Sensitivity Analysis: Confidence Accuracy Score vs Ensemble Size")
|
|
|
|
|
+plt.xlabel("Number of Models in Ensemble")
|
|
|
|
|
+plt.ylabel("Confidence Accuracy Score (per image)")
|
|
|
|
|
+plt.grid(True)
|
|
|
|
|
+plt.xticks(ticks)
|
|
|
|
|
+
|
|
|
|
|
+# Optionally overlay raw sample distributions as jittered points
|
|
|
|
|
+for k in ensemble_sizes:
|
|
|
|
|
+ y = confidence_accuracy_scores[k]
|
|
|
|
|
+ x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2 # small jitter
|
|
|
|
|
+ plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
|
|
|
|
|
+
|
|
|
|
|
+plt.tight_layout()
|
|
|
|
|
+plt.savefig(plots_dir / "sensitivity_confidence_accuracy_vs_ensemble_size.png")
|
|
|
|
|
+
|
|
|
|
|
+# Plot mean predicted-class confidence for correct vs incorrect predictions
|
|
|
|
|
+plt.figure(figsize=(10, 6))
|
|
|
|
|
+plt.errorbar(
|
|
|
|
|
+ ensemble_sizes,
|
|
|
|
|
+ mean_correct_prediction_confidences,
|
|
|
|
|
+ yerr=std_correct_prediction_confidences,
|
|
|
|
|
+ fmt="-o",
|
|
|
|
|
+ capsize=3,
|
|
|
|
|
+ color="tab:green",
|
|
|
|
|
+ label="Correct predictions",
|
|
|
|
|
+)
|
|
|
|
|
+plt.errorbar(
|
|
|
|
|
+ ensemble_sizes,
|
|
|
|
|
+ mean_incorrect_prediction_confidences,
|
|
|
|
|
+ yerr=std_incorrect_prediction_confidences,
|
|
|
|
|
+ fmt="-o",
|
|
|
|
|
+ capsize=3,
|
|
|
|
|
+ color="tab:red",
|
|
|
|
|
+ label="Incorrect predictions",
|
|
|
|
|
+)
|
|
|
|
|
+plt.title("Sensitivity Analysis: Avg Predicted-Class Confidence (Correct vs Incorrect)")
|
|
|
|
|
+plt.xlabel("Number of Models in Ensemble")
|
|
|
|
|
+plt.ylabel("Average Predicted-Class Confidence")
|
|
|
|
|
+plt.grid(True)
|
|
|
|
|
+plt.xticks(ticks)
|
|
|
|
|
+
|
|
|
|
|
+# Overlay raw sample distributions as jittered points
|
|
|
|
|
+for k in ensemble_sizes:
|
|
|
|
|
+ x = np.full(samples_per_size if k != num_models else 1, k) + (
|
|
|
|
|
+ (rng.random(samples_per_size if k != num_models else 1) - 0.5) * 0.2
|
|
|
|
|
+ )
|
|
|
|
|
+ y_correct = np.asarray(correct_prediction_confidences[k], dtype=float)
|
|
|
|
|
+ y_incorrect = np.asarray(incorrect_prediction_confidences[k], dtype=float)
|
|
|
|
|
+ if y_correct.size:
|
|
|
|
|
+ plt.scatter(
|
|
|
|
|
+ x[: y_correct.size],
|
|
|
|
|
+ y_correct,
|
|
|
|
|
+ alpha=0.45,
|
|
|
|
|
+ s=14,
|
|
|
|
|
+ color="lightgray",
|
|
|
|
|
+ marker="o",
|
|
|
|
|
+ )
|
|
|
|
|
+ if y_incorrect.size:
|
|
|
|
|
+ plt.scatter(
|
|
|
|
|
+ x[: y_incorrect.size],
|
|
|
|
|
+ y_incorrect,
|
|
|
|
|
+ alpha=0.45,
|
|
|
|
|
+ s=18,
|
|
|
|
|
+ color="dimgray",
|
|
|
|
|
+ marker="x",
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+plt.legend()
|
|
|
|
|
+plt.tight_layout()
|
|
|
|
|
+plt.savefig(plots_dir / "sensitivity_avg_confidence_correct_vs_incorrect.png")
|
|
|
|
|
+
|
|
|
|
|
+# Plot mean confidence delta per model (correct - incorrect) vs ensemble size
|
|
|
|
|
+plt.figure(figsize=(10, 6))
|
|
|
|
|
+plt.errorbar(
|
|
|
|
|
+ ensemble_sizes,
|
|
|
|
|
+ mean_confidence_delta_per_model,
|
|
|
|
|
+ yerr=std_confidence_delta_per_model,
|
|
|
|
|
+ fmt="-o",
|
|
|
|
|
+ capsize=3,
|
|
|
|
|
+ color="tab:blue",
|
|
|
|
|
+)
|
|
|
|
|
+plt.title("Sensitivity Analysis: Confidence Delta Per Model (Correct - Incorrect)")
|
|
|
|
|
+plt.xlabel("Number of Models in Ensemble")
|
|
|
|
|
+plt.ylabel("Avg Delta Per Model")
|
|
|
|
|
+plt.grid(True)
|
|
|
|
|
+plt.xticks(ticks)
|
|
|
|
|
+
|
|
|
|
|
+for k in ensemble_sizes:
|
|
|
|
|
+ y = confidence_delta_per_model[k]
|
|
|
|
|
+ x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2
|
|
|
|
|
+ plt.scatter(x, y, alpha=0.35, s=12, color="lightgray")
|
|
|
|
|
+
|
|
|
|
|
+plt.tight_layout()
|
|
|
|
|
+plt.savefig(plots_dir / "sensitivity_confidence_delta_per_model.png")
|
|
|
# End of Copilot section
|
|
# End of Copilot section
|