
Added F1 evaluation

Nicholas Schense 1 week ago
parent
commit
3201101e50

+ 75 - 2
analysis/sensitivity_analysis.py

@@ -45,8 +45,19 @@ mean_accuracies: list[float] = []
 std_accuracies: list[float] = []
 all_accuracies: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
 
+# Confusion-matrix counts per ensemble size (one entry per sample draw)
+all_true_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+all_false_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
+
 for k in ensemble_sizes:
     accuracies_k = []
+    true_positives_k: list[int] = []
+    false_positives_k: list[int] = []
+    true_negatives_k: list[int] = []
+    false_negatives_k: list[int] = []
+
     # If using the full set, evaluate once deterministically
     if k == num_models:
         selected_idx = np.arange(num_models)
@@ -54,6 +65,16 @@ for k in ensemble_sizes:
         confs = preds_selected.sel(img_class=1).values
         predicted_positive = confs >= 0.5
         true_positive = true_labels == 1
+
+        tp = int((predicted_positive & true_positive).sum().item())
+        fp = int((predicted_positive & ~true_positive).sum().item())
+        tn = int((~predicted_positive & ~true_positive).sum().item())
+        fn = int((~predicted_positive & true_positive).sum().item())
+        true_positives_k.append(tp)
+        false_positives_k.append(fp)
+        true_negatives_k.append(tn)
+        false_negatives_k.append(fn)
+
         acc = (predicted_positive == true_positive).sum().item() / len(confs)
         accuracies_k.append(acc)
     else:
@@ -63,12 +84,49 @@ for k in ensemble_sizes:
             confs = preds_selected.sel(img_class=1).values
             predicted_positive = confs >= 0.5
             true_positive = true_labels == 1
+
+            tp = int((predicted_positive & true_positive).sum().item())
+            fp = int((predicted_positive & ~true_positive).sum().item())
+            tn = int((~predicted_positive & ~true_positive).sum().item())
+            fn = int((~predicted_positive & true_positive).sum().item())
+            true_positives_k.append(tp)
+            false_positives_k.append(fp)
+            true_negatives_k.append(tn)
+            false_negatives_k.append(fn)
+
             acc = (predicted_positive == true_positive).sum().item() / len(confs)
             accuracies_k.append(acc)
+
     all_accuracies[k] = accuracies_k
+    all_true_positives[k] = true_positives_k
+    all_false_positives[k] = false_positives_k
+    all_true_negatives[k] = true_negatives_k
+    all_false_negatives[k] = false_negatives_k
+
     mean_accuracies.append(float(np.mean(accuracies_k)))
     std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
 
+# Compute F1 scores per ensemble size from stored confusion counts
+mean_f1s: list[float] = []
+std_f1s: list[float] = []
+all_f1s: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
+
+for k in ensemble_sizes:
+    tp_arr = np.asarray(all_true_positives[k], dtype=float)
+    fp_arr = np.asarray(all_false_positives[k], dtype=float)
+    fn_arr = np.asarray(all_false_negatives[k], dtype=float)
+    denom = 2 * tp_arr + fp_arr + fn_arr
+    f1_arr = np.divide(
+        2 * tp_arr,
+        denom,
+        out=np.zeros_like(denom, dtype=float),
+        where=denom != 0,
+    )
+    f1s_k = [float(x) for x in f1_arr.tolist()]
+    all_f1s[k] = f1s_k
+    mean_f1s.append(float(np.mean(f1s_k)))
+    std_f1s.append(float(np.std(f1s_k, ddof=0)))
+
 # Plot mean accuracy vs ensemble size with error bars (std)
 plt.figure(figsize=(10, 6))
 plt.errorbar(ensemble_sizes, mean_accuracies, yerr=std_accuracies, fmt="-o", capsize=3)
@@ -91,7 +149,22 @@ for i, k in enumerate(ensemble_sizes):
 plt.tight_layout()
 
 plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
-# End of Copilot section
 
+# Plot mean F1 vs ensemble size with error bars (std)
+plt.figure(figsize=(10, 6))
+plt.errorbar(ensemble_sizes, mean_f1s, yerr=std_f1s, fmt="-o", capsize=3)
+plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
+plt.xlabel("Number of Models in Ensemble")
+plt.ylabel("F1 Score")
+plt.grid(True)
+plt.xticks(ticks)
 
-#
+# Optionally overlay raw sample distributions as jittered points
+for i, k in enumerate(ensemble_sizes):
+    y = all_f1s[k]
+    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
+    plt.scatter(x, y, alpha=0.3, s=8, color="gray")
+
+plt.tight_layout()
+plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")
+# End of Copilot section
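
For reference, a minimal standalone sketch of how the count-based F1 used in this diff (2*TP / (2*TP + FP + FN)) could be cross-checked against scikit-learn. Note that scikit-learn is not used by this script, and the synthetic labels and confidences below are stand-ins for the script's true_labels and confs; this is only an illustrative consistency check, not part of the commit.

    import numpy as np
    from sklearn.metrics import f1_score  # assumed available; not a dependency of sensitivity_analysis.py

    # Synthetic stand-ins for the script's true_labels and class-1 confidences
    rng = np.random.default_rng(0)
    true_labels = rng.integers(0, 2, size=200)
    confs = rng.random(200)
    predicted_positive = confs >= 0.5

    # Confusion counts, mirroring the diff's logic
    tp = int((predicted_positive & (true_labels == 1)).sum())
    fp = int((predicted_positive & (true_labels == 0)).sum())
    fn = int((~predicted_positive & (true_labels == 1)).sum())

    # Count-based F1 with the same zero-denominator guard as the diff
    denom = 2 * tp + fp + fn
    f1_counts = 2 * tp / denom if denom else 0.0

    # Library-based F1 on the same predictions should agree
    f1_sklearn = f1_score(true_labels, predicted_positive.astype(int))
    assert np.isclose(f1_counts, f1_sklearn)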

BIN  output/Full_Ensemble(50x30) /plots/sensitivity_accuracy_vs_ensemble_size.png
BIN  output/Full_Ensemble(50x30) /plots/sensitivity_f1_vs_ensemble_size.png