Sfoglia il codice sorgente

graphs and longitudinal

Nicholas Schense 1 settimana fa
parent
commit
4be550e4a0

+ 2 - 0
.gitignore

@@ -58,3 +58,5 @@ docs/_build/
 # PyBuilder
 target/
 
+.venv
+**.png

+ 129 - 0
analysis/longitudinal_comparison.py

@@ -0,0 +1,129 @@
+from utils.config import config
+import pathlib as pl
+import numpy as np
+import pandas as pd
+from collections.abc import Iterable
+
+
+RESET_COLOR = "\033[0m"
+DIAGNOSIS_COLORS = {
+    "CN": "\033[92m",
+    "sMCI": "\033[93m",
+    "pMCI": "\033[96m",
+    "AD": "\033[91m",
+}
+
+
+adni_data = pd.read_csv(config["analysis"]["adni_path"])
+
+# Strip leading and trailing spaces from column names
+
+adni_data = adni_data.rename(columns=lambda x: x.strip())
+adni_data["EXAMDATE"] = pd.to_datetime(
+    adni_data["EXAMDATE"].astype("string").str.strip(),
+    format="%m/%d/%y",
+    errors="coerce",
+)
+adni_data = adni_data.sort_values(["PTID", "EXAMDATE"], na_position="last")
+
+
+plots_dir = (
+    pl.Path("../output") / pl.Path(config["analysis"]["evaluation_name"]) / "plots"
+)
+plots_dir.mkdir(parents=True, exist_ok=True)
+
+
+# The goal for this analysis is to compare the confidence of the model on longitudinal data, where we analyze patients who have multiple images and switched from MCI or CN to AD to patients who stayed stable.
+# First filter to only inlude patients with multiple images
+
+multiple_images = adni_data.groupby("PTID").filter(lambda x: len(x) > 1)
+
+
+# Filter by patients who switched from MCI or CN to AD, accounting for an unknown number of images per patient.
def is_missing_diagnosis(diagnosis: object) -> bool:
    """Return True if *diagnosis* is a missing value.

    Uses ``pd.isna`` so every pandas missing marker is recognized
    (``None``, float NaN, ``pd.NA``, ``pd.NaT``, and numpy NaN scalars).
    The previous explicit checks missed NaN scalars that are not Python
    ``float`` subclasses, such as ``np.float32("nan")``.

    Parameters
    ----------
    diagnosis:
        A single value taken from the "Class" column of the ADNI table.
    """
    # pd.isna returns a plain bool for scalar input; values here come from a
    # DataFrame column one at a time, so scalar input is the expected case.
    return bool(pd.isna(diagnosis))
+
+
def normalize_diagnoses(diagnoses: Iterable[object]) -> np.ndarray:
    """Drop missing entries and return the remaining diagnoses as a
    whitespace-stripped string array."""
    cleaned: list[str] = []
    for raw in diagnoses:
        if is_missing_diagnosis(raw):
            continue
        cleaned.append(str(raw).strip())
    return np.array(cleaned, dtype=str)
+
+
def format_diagnoses(diagnoses: Iterable[object]) -> str:
    """Render diagnoses as a bracketed, comma-separated list, coloring each
    label with ANSI codes when it has an entry in DIAGNOSIS_COLORS."""
    parts: list[str] = []
    for label in normalize_diagnoses(diagnoses):
        color = DIAGNOSIS_COLORS.get(label)
        if color is None:
            parts.append(label)
        else:
            parts.append(f"{color}{label}{RESET_COLOR}")
    return f"[{', '.join(parts)}]"
+
+
# Print a list of patients with multiple images and their diagnoses
print("Patients with multiple images and their diagnoses:")
for ptid, group in multiple_images.groupby("PTID"):
    # "Class" holds the diagnosis label for each of this patient's images
    diagnoses = group["Class"].values
    print(f"PTID: {ptid}, Diagnoses: {format_diagnoses(diagnoses)}")
+
+
def has_mci_dx(diagnoses: Iterable[object]) -> bool:
    """Return True if any visit carries an MCI diagnosis (sMCI or pMCI).

    Bug fix: the previous version also matched "CN", so a patient whose
    history was entirely CN satisfied both ``has_mci_dx`` and ``has_nc_dx``
    and was wrongly reported by ``switched_class`` as having switched
    diagnosis classes.
    """
    diagnoses = normalize_diagnoses(diagnoses)
    return bool(np.any((diagnoses == "sMCI") | (diagnoses == "pMCI")))
+
+
def has_ad_dx(diagnoses: Iterable[object]) -> bool:
    """Return True if any of the normalized diagnoses is AD."""
    labels = normalize_diagnoses(diagnoses)
    return bool((labels == "AD").any())
+
+
def has_nc_dx(diagnoses: Iterable[object]) -> bool:
    """Return True if any of the normalized diagnoses is CN (cognitively normal)."""
    labels = normalize_diagnoses(diagnoses)
    return bool((labels == "CN").any())
+
+
# Switched from MCI to CN or AD, or from CN to MCI or AD
def switched_class(diagnoses: Iterable[object]) -> bool:
    """Return True when the diagnosis history combines classes as detected
    by has_mci_dx / has_ad_dx / has_nc_dx: MCI together with AD, or CN
    together with either MCI or AD."""
    mci = has_mci_dx(diagnoses)
    ad = has_ad_dx(diagnoses)
    cn = has_nc_dx(diagnoses)
    return (mci and ad) or (cn and (mci or ad))
+
+
# Keep only patients whose diagnosis history spans more than one class
patients_switched = multiple_images.groupby("PTID").filter(
    lambda x: switched_class(x["Class"].values)
)

print("\nPatients who switched from MCI or CN to AD or vice versa:")
for ptid, group in patients_switched.groupby("PTID"):
    diagnoses = group["Class"].values
    print(f"PTID: {ptid}, Diagnoses: {format_diagnoses(diagnoses)}")


# print length of patients with multiple images, and how many switched from MCI or CN to AD or vice versa
print(f"\nTotal patients with multiple images: {multiple_images['PTID'].nunique()}")
print(
    f"Total patients who switched from MCI or CN to AD or vice versa: {patients_switched['PTID'].nunique()}"
)


# Filter just for the patients who switched from CN to AD
# NOTE(review): this keeps patients who have both a CN and an AD diagnosis in
# any order; the visit sequence is not checked — confirm CN is meant to
# precede AD if temporal ordering matters here.
patients_switched_cn_to_ad = multiple_images.groupby("PTID").filter(
    lambda x: has_nc_dx(x["Class"].values) and has_ad_dx(x["Class"].values)
)

print("\nPatients who switched from CN to AD:")
for ptid, group in patients_switched_cn_to_ad.groupby("PTID"):
    diagnoses = group["Class"].values
    print(f"PTID: {ptid}, Diagnoses: {format_diagnoses(diagnoses)}")

+ 89 - 0
analysis/physician_comparison.py

@@ -0,0 +1,89 @@
import xarray as xr
from utils.config import config
import pathlib as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# This script compares the confidence of the model outputs with the
# diagnosis-confidence ratings the ADNI physicians assigned.

adni_data = pd.read_csv(config["analysis"]["adni_path"])

# Output directory for plots; relative to the current working directory
plots_dir = (
    pl.Path("../output") / pl.Path(config["analysis"]["evaluation_name"]) / "plots"
)
plots_dir.mkdir(parents=True, exist_ok=True)


# The rating column's name embeds its legend (and a trailing space) verbatim.
physician_ratings = adni_data[
    "DXCONFID (1=uncertain, 2= mild, 3= moderate, 4=high confidence) "
].to_numpy(dtype=str)

image_ids = adni_data["Image Data ID"].to_numpy(dtype=str)

# np.strings requires NumPy >= 2.0
physician_ratings = np.strings.strip(physician_ratings)
image_ids = np.strings.strip(image_ids)

# Keep only rows with a usable rating: drop "na", empty strings, and "nan"
# (which is what a float NaN in the CSV becomes after the str conversion above
# and would otherwise crash astype(int)).
valid_indices = np.where(physician_ratings != "na")[0]
valid_indices = valid_indices[physician_ratings[valid_indices] != ""]
valid_indices = valid_indices[physician_ratings[valid_indices] != "nan"]
physician_ratings = physician_ratings[valid_indices].astype(int)
# NOTE(review): assumes "Image Data ID" values are purely numeric strings —
# an alphanumeric ID (e.g. "I12345") would make this astype(int) raise.
csv_img_ids = image_ids[valid_indices].astype(int)


# Load the evaluation results; config paths are relative to this file's folder.
os.chdir(pl.Path(__file__).parent)
model_dataset_path = pl.Path("../model_evaluations") / pl.Path(
    config["analysis"]["evaluation_name"].strip()
).with_suffix(".nc")

print(f"Loading evaluation results from {model_dataset_path}")

array = xr.open_dataset(model_dataset_path)  # type: ignore

# This dataset includes two dataarrays: 'img_id' and 'predictions' that we will
# use to determine the model confidence for each image ID
eval_img_ids = array["img_id"]
predictions = array["predictions"]

# Ensemble confidence per image: average the per-model class probabilities,
# then take the probability of the predicted (higher-probability) class.
model_confidences = predictions.mean(dim="model").max(dim="img_class").values


# Find the shared image IDs between the model evaluation and the physician ratings
shared_img_ids, model_indices, csv_indices = np.intersect1d(
    eval_img_ids.values, csv_img_ids, return_indices=True
)


# Get the corresponding physician ratings and model confidences for the shared image IDs
shared_physician_ratings = physician_ratings[csv_indices]
shared_model_confidences = model_confidences[model_indices]


# Print distribution of ratings for shared samples
# (a stray trailing comma previously turned the print below into a no-op tuple)
print("Distribution of Physician Ratings for Shared Samples:")
(unique, counts) = np.unique(shared_physician_ratings, return_counts=True)
distribution = dict(zip(unique, counts))
for rating in range(1, 5):
    count = distribution.get(rating, 0)
    print(f"  Rating {rating}: {count} samples")


# Graph the model confidence vs physician ratings using a box plot
# (comment previously said "violin plot", but plt.boxplot is what is drawn)
plt.figure(figsize=(10, 6))
plt.boxplot(
    [
        shared_model_confidences[shared_physician_ratings == rating]
        for rating in range(1, 5)
    ],
    positions=[1, 2, 3, 4],
    widths=0.6,
)
plt.xticks([1, 2, 3, 4], ["1 (Uncertain)", "2 (Mild)", "3 (Moderate)", "4 (High)"])
plt.xlabel("Physician Confidence Rating")
plt.ylabel("Model Confidence")
plt.title("Model Confidence vs Physician Confidence Ratings")
plt.grid(axis="y")
plt.savefig(plots_dir / "model_confidence_vs_physician_ratings_boxplot.png")
plt.close()

+ 312 - 6
analysis/sensitivity_analysis.py

@@ -32,7 +32,7 @@ plots_dir = (
 plots_dir.mkdir(parents=True, exist_ok=True)
 
 # Configuration for the sensitivity analysis
-rng = np.random.default_rng(42)
+rng = np.random.default_rng(777)
 num_models = int(predictions.sizes["model"])
 ensemble_sizes = list(range(1, num_models + 1))
 samples_per_size = 50
@@ -51,21 +51,166 @@ all_false_positives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
 all_true_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
 all_false_negatives: dict[int, list[int]] = {k: [] for k in ensemble_sizes}
 
+# Also implement a "confidence accuracy score":
+# sum over samples of (confidence_of_predicted_class - 0.5) * (+1 if correct else -1)
+confidence_accuracy_scores: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
+mean_confidence_accuracy_scores: list[float] = []
+std_confidence_accuracy_scores: list[float] = []
+
+# Track average confidence of predicted class for correct vs incorrect predictions
+correct_prediction_confidences: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
+incorrect_prediction_confidences: dict[int, list[float]] = {
+    k: [] for k in ensemble_sizes
+}
+mean_correct_prediction_confidences: list[float] = []
+std_correct_prediction_confidences: list[float] = []
+mean_incorrect_prediction_confidences: list[float] = []
+std_incorrect_prediction_confidences: list[float] = []
+
+# Track delta (correct - incorrect) confidence per model, averaged over models
+confidence_delta_per_model: dict[int, list[float]] = {k: [] for k in ensemble_sizes}
+mean_confidence_delta_per_model: list[float] = []
+std_confidence_delta_per_model: list[float] = []
+
+
def confidence_accuracy_score(
    positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
) -> float:
    """Per-image confidence accuracy score, normalized to [-1, 1].

    Each sample contributes (confidence_of_predicted_class - 0.5), counted
    positively when the prediction is correct and negatively otherwise; the
    total is divided by 0.5 * n_samples. (The previous inline comment claimed
    a [0, 1] range, but incorrect predictions contribute negatively, so the
    true range is [-1, 1].)

    Parameters
    ----------
    positive_class_confidence:
        Probability/confidence assigned to the positive class (class 1), shape (n_samples,).
    true_labels_binary:
        Ground-truth binary labels in {0, 1}, shape (n_samples,).

    Returns
    -------
    float
        The normalized score; 0.0 for empty input (previously this divided
        0 by 0 and produced NaN plus a runtime warning).

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """

    confs = np.asarray(positive_class_confidence, dtype=float)
    y = np.asarray(true_labels_binary, dtype=int)
    if confs.shape[0] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: confs has {confs.shape[0]} samples, labels has {y.shape[0]}"
        )
    # Guard the empty case to avoid a 0/0 division below.
    if confs.shape[0] == 0:
        return 0.0

    predicted_positive = confs >= 0.5
    true_positive = y == 1
    correct = predicted_positive == true_positive

    # Confidence of the predicted class (so correct negatives contribute positively)
    predicted_class_confidence = np.where(predicted_positive, confs, 1.0 - confs)

    sign = np.where(correct, 1.0, -1.0)
    # Normalize by 0.5 * n_samples so the per-image average lies in [-1, 1]
    return float(
        np.sum((predicted_class_confidence - 0.5) * sign) / (0.5 * confs.shape[0])
    )
+
+
def average_predicted_class_confidence_by_correctness(
    positive_class_confidence: np.ndarray, true_labels_binary: np.ndarray
) -> tuple[float, float]:
    """Return (mean_conf_correct, mean_conf_incorrect) for predicted class confidence.

    The confidence of the predicted class is taken per sample:
    - predicted positive: confidence = P(class=1)
    - predicted negative: confidence = P(class=0) = 1 - P(class=1)
    A group with no samples yields NaN for its mean.
    """

    confs = np.asarray(positive_class_confidence, dtype=float)
    y = np.asarray(true_labels_binary, dtype=int)
    if confs.shape[0] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: confs has {confs.shape[0]} samples, labels has {y.shape[0]}"
        )

    is_pred_pos = confs >= 0.5
    is_correct = is_pred_pos == (y == 1)
    chosen_conf = np.where(is_pred_pos, confs, 1.0 - confs)

    def _mean_or_nan(values: np.ndarray) -> float:
        # Empty groups (all correct or all incorrect) map to NaN.
        return float(values.mean()) if values.size else float("nan")

    return _mean_or_nan(chosen_conf[is_correct]), _mean_or_nan(chosen_conf[~is_correct])
+
+
def average_confidence_delta_per_model(
    predictions_selected: xr.DataArray, true_labels_binary: np.ndarray
) -> float:
    """Average (correct - incorrect) predicted-class confidence per model.

    For each model independently:
    1) take the predicted-class confidence of every sample
    2) split samples into correct vs incorrect
    3) delta = mean(correct) - mean(incorrect), NaN if either group is empty
    The mean delta across models is returned, ignoring NaN entries.
    """

    per_model_confs = predictions_selected.sel(img_class=1).values  # (k_models, n)
    y = np.asarray(true_labels_binary, dtype=int)
    if per_model_confs.shape[1] != y.shape[0]:
        raise ValueError(
            f"Mismatched lengths: predictions have {per_model_confs.shape[1]} samples, labels has {y.shape[0]}"
        )

    # Predicted class and its confidence, model by model (broadcasts over samples)
    pred_pos = per_model_confs >= 0.5
    correct_mask = pred_pos == (y == 1)
    chosen_conf = np.where(pred_pos, per_model_confs, 1.0 - per_model_confs)

    model_deltas: list[float] = []
    for conf_row, correct_row in zip(chosen_conf, correct_mask):
        hits = conf_row[correct_row]
        misses = conf_row[~correct_row]
        if hits.size == 0 or misses.size == 0:
            # A model with no correct or no incorrect samples has no delta.
            model_deltas.append(float("nan"))
        else:
            model_deltas.append(float(hits.mean() - misses.mean()))

    return float(np.nanmean(np.asarray(model_deltas, dtype=float)))
+
+
 for k in ensemble_sizes:
     accuracies_k = []
     true_positives_k: list[int] = []
     false_positives_k: list[int] = []
     true_negatives_k: list[int] = []
     false_negatives_k: list[int] = []
-
+    confidence_scores_k: list[float] = []
+    correct_confidences_k: list[float] = []
+    incorrect_confidences_k: list[float] = []
+    confidence_deltas_per_model_k: list[float] = []
     # If using the full set, evaluate once deterministically
     if k == num_models:
         selected_idx = np.arange(num_models)
         preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
+        per_model_preds_selected = predictions.isel(model=selected_idx)
         confs = preds_selected.sel(img_class=1).values
         predicted_positive = confs >= 0.5
         true_positive = true_labels == 1
 
+        confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
+        mean_correct_conf, mean_incorrect_conf = (
+            average_predicted_class_confidence_by_correctness(confs, true_labels)
+        )
+        correct_confidences_k.append(mean_correct_conf)
+        incorrect_confidences_k.append(mean_incorrect_conf)
+        confidence_deltas_per_model_k.append(
+            average_confidence_delta_per_model(per_model_preds_selected, true_labels)
+        )
+
         tp = int((predicted_positive & true_positive).sum().item())
         fp = int((predicted_positive & ~true_positive).sum().item())
         tn = int((~predicted_positive & ~true_positive).sum().item())
@@ -81,10 +226,23 @@ for k in ensemble_sizes:
         for _ in range(samples_per_size):
             selected_idx = rng.choice(num_models, size=k, replace=False)
             preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
+            per_model_preds_selected = predictions.isel(model=selected_idx)
             confs = preds_selected.sel(img_class=1).values
             predicted_positive = confs >= 0.5
             true_positive = true_labels == 1
 
+            confidence_scores_k.append(confidence_accuracy_score(confs, true_labels))
+            mean_correct_conf, mean_incorrect_conf = (
+                average_predicted_class_confidence_by_correctness(confs, true_labels)
+            )
+            correct_confidences_k.append(mean_correct_conf)
+            incorrect_confidences_k.append(mean_incorrect_conf)
+            confidence_deltas_per_model_k.append(
+                average_confidence_delta_per_model(
+                    per_model_preds_selected, true_labels
+                )
+            )
+
             tp = int((predicted_positive & true_positive).sum().item())
             fp = int((predicted_positive & ~true_positive).sum().item())
             tn = int((~predicted_positive & ~true_positive).sum().item())
@@ -102,6 +260,32 @@ for k in ensemble_sizes:
     all_false_positives[k] = false_positives_k
     all_true_negatives[k] = true_negatives_k
     all_false_negatives[k] = false_negatives_k
+    confidence_accuracy_scores[k] = confidence_scores_k
+    mean_confidence_accuracy_scores.append(float(np.mean(confidence_scores_k)))
+    std_confidence_accuracy_scores.append(float(np.std(confidence_scores_k, ddof=0)))
+
+    correct_prediction_confidences[k] = correct_confidences_k
+    incorrect_prediction_confidences[k] = incorrect_confidences_k
+    mean_correct_prediction_confidences.append(
+        float(np.nanmean(np.asarray(correct_confidences_k, dtype=float)))
+    )
+    std_correct_prediction_confidences.append(
+        float(np.nanstd(np.asarray(correct_confidences_k, dtype=float), ddof=0))
+    )
+    mean_incorrect_prediction_confidences.append(
+        float(np.nanmean(np.asarray(incorrect_confidences_k, dtype=float)))
+    )
+    std_incorrect_prediction_confidences.append(
+        float(np.nanstd(np.asarray(incorrect_confidences_k, dtype=float), ddof=0))
+    )
+
+    confidence_delta_per_model[k] = confidence_deltas_per_model_k
+    mean_confidence_delta_per_model.append(
+        float(np.nanmean(np.asarray(confidence_deltas_per_model_k, dtype=float)))
+    )
+    std_confidence_delta_per_model.append(
+        float(np.nanstd(np.asarray(confidence_deltas_per_model_k, dtype=float), ddof=0))
+    )
 
     mean_accuracies.append(float(np.mean(accuracies_k)))
     std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
@@ -129,7 +313,15 @@ for k in ensemble_sizes:
 
 # Plot mean accuracy vs ensemble size with error bars (std)
 plt.figure(figsize=(10, 6))
-plt.errorbar(ensemble_sizes, mean_accuracies, yerr=std_accuracies, fmt="-o", capsize=3)
+plt.errorbar(
+    ensemble_sizes,
+    mean_accuracies,
+    yerr=std_accuracies,
+    fmt="-o",
+    capsize=3,
+    color="tab:blue",
+    ecolor="tab:blue",
+)
 plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
 plt.xlabel("Number of Models in Ensemble")
 plt.ylabel("Accuracy")
@@ -144,7 +336,7 @@ plt.xticks(ticks)
 for i, k in enumerate(ensemble_sizes):
     y = all_accuracies[k]
     x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
-    plt.scatter(x, y, alpha=0.3, s=8, color="gray")
+    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
 
 plt.tight_layout()
 
@@ -152,7 +344,15 @@ plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
 
 # Plot mean F1 vs ensemble size with error bars (std)
 plt.figure(figsize=(10, 6))
-plt.errorbar(ensemble_sizes, mean_f1s, yerr=std_f1s, fmt="-o", capsize=3)
+plt.errorbar(
+    ensemble_sizes,
+    mean_f1s,
+    yerr=std_f1s,
+    fmt="-o",
+    capsize=3,
+    color="tab:orange",
+    ecolor="tab:orange",
+)
 plt.title("Sensitivity Analysis: F1 Score vs Ensemble Size")
 plt.xlabel("Number of Models in Ensemble")
 plt.ylabel("F1 Score")
@@ -163,8 +363,114 @@ plt.xticks(ticks)
 for i, k in enumerate(ensemble_sizes):
     y = all_f1s[k]
     x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
-    plt.scatter(x, y, alpha=0.3, s=8, color="gray")
+    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")
 
 plt.tight_layout()
 plt.savefig(plots_dir / "sensitivity_f1_vs_ensemble_size.png")
+
# Plot mean confidence accuracy score vs ensemble size with error bars (std)
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_confidence_accuracy_scores,
    yerr=std_confidence_accuracy_scores,
    fmt="-o",
    capsize=3,
    color="tab:purple",
    ecolor="tab:purple",
)
plt.title("Sensitivity Analysis: Confidence Accuracy Score vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Confidence Accuracy Score (per image)")
plt.grid(True)
plt.xticks(ticks)

# Optionally overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    y = confidence_accuracy_scores[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.35, s=10, color="lightgray")

plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_confidence_accuracy_vs_ensemble_size.png")

# Plot mean predicted-class confidence for correct vs incorrect predictions
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_correct_prediction_confidences,
    yerr=std_correct_prediction_confidences,
    fmt="-o",
    capsize=3,
    color="tab:green",
    label="Correct predictions",
)
plt.errorbar(
    ensemble_sizes,
    mean_incorrect_prediction_confidences,
    yerr=std_incorrect_prediction_confidences,
    fmt="-o",
    capsize=3,
    color="tab:red",
    label="Incorrect predictions",
)
plt.title("Sensitivity Analysis: Avg Predicted-Class Confidence (Correct vs Incorrect)")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Average Predicted-Class Confidence")
plt.grid(True)
plt.xticks(ticks)

# Overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    # Each size has samples_per_size samples, except the full ensemble which
    # is evaluated once deterministically (hence the size-1 special case).
    x = np.full(samples_per_size if k != num_models else 1, k) + (
        (rng.random(samples_per_size if k != num_models else 1) - 0.5) * 0.2
    )
    y_correct = np.asarray(correct_prediction_confidences[k], dtype=float)
    y_incorrect = np.asarray(incorrect_prediction_confidences[k], dtype=float)
    if y_correct.size:
        plt.scatter(
            x[: y_correct.size],
            y_correct,
            alpha=0.45,
            s=14,
            color="lightgray",
            marker="o",
        )
    if y_incorrect.size:
        plt.scatter(
            x[: y_incorrect.size],
            y_incorrect,
            alpha=0.45,
            s=18,
            color="dimgray",
            marker="x",
        )

plt.legend()
plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_avg_confidence_correct_vs_incorrect.png")

# Plot mean confidence delta per model (correct - incorrect) vs ensemble size
plt.figure(figsize=(10, 6))
plt.errorbar(
    ensemble_sizes,
    mean_confidence_delta_per_model,
    yerr=std_confidence_delta_per_model,
    fmt="-o",
    capsize=3,
    color="tab:blue",
)
plt.title("Sensitivity Analysis: Confidence Delta Per Model (Correct - Incorrect)")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Avg Delta Per Model")
plt.grid(True)
plt.xticks(ticks)

for k in ensemble_sizes:
    y = confidence_delta_per_model[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2
    plt.scatter(x, y, alpha=0.35, s=12, color="lightgray")

plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_confidence_delta_per_model.png")
 # End of Copilot section

+ 2 - 1
config.toml

@@ -1,2 +1,3 @@
 [analysis]
-evaluation_name = "Full_Ensemble(50x30) "
+evaluation_name = "Full_Ensemble(50x30) "
+adni_path = "../data/LP_ADNIMERGE.csv"