@@ -45,8 +45,19 @@ mean_accuracies: list[float] = []
 std_accuracies: list[float] = []
 all_accuracies: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
 
+# Confusion-matrix counts per ensemble size (one entry per sample draw)
+all_true_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+all_false_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+
 for k in ensemble_sizes:
     accuracies_k = []
+    true_positives_k: list[int] = []
+    false_positives_k: list[int] = []
+    true_negatives_k: list[int] = []
+    false_negatives_k: list[int] = []
+
     # If using the full set, evaluate once deterministically
     if k == num_models:
         selected_idx = np.arange(num_models)
@@ -54,6 +65,16 @@ for k in ensemble_sizes:
         confs = preds_selected.sel(img_class=1).values
         predicted_positive = confs >= 0.5
         true_positive = true_labels == 1
+
+        tp = int((predicted_positive & true_positive).sum().item())
+        fp = int((predicted_positive & ~true_positive).sum().item())
+        tn = int((~predicted_positive & ~true_positive).sum().item())
+        fn = int((~predicted_positive & true_positive).sum().item())
+        true_positives_k.append(tp)
+        false_positives_k.append(fp)
+        true_negatives_k.append(tn)
+        false_negatives_k.append(fn)
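+        # Sanity check: tp + fp + tn + fn == len(confs), so the accuracy
+        # computed below equals (tp + tn) / len(confs).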
+
         acc = (predicted_positive == true_positive).sum().item() / len(confs)
         accuracies_k.append(acc)
     else:
@@ -63,12 +84,49 @@ for k in ensemble_sizes:
             confs = preds_selected.sel(img_class=1).values
             predicted_positive = confs >= 0.5
             true_positive = true_labels == 1
+
+            tp = int((predicted_positive & true_positive).sum().item())
+            fp = int((predicted_positive & ~true_positive).sum().item())
+            tn = int((~predicted_positive & ~true_positive).sum().item())
+            fn = int((~predicted_positive & true_positive).sum().item())
+            true_positives_k.append(tp)
+            false_positives_k.append(fp)
+            true_negatives_k.append(tn)
+            false_negatives_k.append(fn)
+
             acc = (predicted_positive == true_positive).sum().item() / len(confs)
             accuracies_k.append(acc)
+
     all_accuracies[k] = accuracies_k
+    all_true_positives[k] = true_positives_k
+    all_false_positives[k] = false_positives_k
+    all_true_negatives[k] = true_negatives_k
+    all_false_negatives[k] = false_negatives_k
+
     mean_accuracies.append(float(np.mean(accuracies_k)))
     std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
 
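+# Note: F1 needs only TP, FP, and FN; TN is kept for completeness (it would
+# be required for, e.g., specificity or balanced accuracy).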
+# Compute F1 scores per ensemble size from stored confusion counts
+mean_f1s: list[float] = []
+std_f1s: list[float] = []
+all_f1s: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
+
+for k in ensemble_sizes:
+    tp_arr = np.asarray(all_true_positives[k], dtype=float)
+    fp_arr = np.asarray(all_false_positives[k], dtype=float)
+    fn_arr = np.asarray(all_false_negatives[k], dtype=float)
+    denom = 2 * tp_arr + fp_arr + fn_arr
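+    # F1 = 2*TP / (2*TP + FP + FN); a draw with a zero denominator (no
+    # positive labels and no positive predictions) scores F1 = 0, matching
+    # sklearn.metrics.f1_score with zero_division=0.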
+    f1_arr = np.divide(
+        2 * tp_arr,
+        denom,
+        out=np.zeros_like(denom, dtype=float),
+        where=denom != 0,
+    )
+    f1s_k = [float(x) for x in f1_arr.tolist()]
+    all_f1s[k] = f1s_k
+    mean_f1s.append(float(np.mean(f1s_k)))
+    std_f1s.append(float(np.std(f1s_k, ddof=0)))
+
 # Plot mean accuracy vs ensemble size with error bars (std)
 plt.figure(figsize=(10, 6))
 plt.errorbar(ensemble_sizes, mean_accuracies, yerr=std_accuracies, fmt="-o", capsize=3)
@@ -91,7 +149,22 @@ for i, k in enumerate(ensemble_sizes):
 plt.tight_layout()
 
 plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
-# End of Copilot section
-#
+
+# Plot mean F1 vs ensemble size with error bars (std)
+plt.figure(figsize=(10, 6))
+plt.errorbar(ensemble_sizes, mean_f1s, yerr=std_f1s, fmt="-o", capsize=3)
+plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
+plt.xlabel("Number of Models in Ensemble")
+plt.ylabel("F1 Score")
+plt.grid(True)
+plt.xticks(ticks)
+
+# Optionally overlay raw sample distributions as jittered points
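+# (assumes `rng`, the np.random.Generator used for the accuracy overlay,
+# is in scope; jitter is uniform in [-0.1, 0.1) along the x axis)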
+for i, k in enumerate(ensemble_sizes):
+    y = all_f1s[k]
+    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
+    plt.scatter(x, y, alpha=0.3, s=8, color="gray")
+
+plt.tight_layout()
+plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")
+# End of Copilot section