# The purpose of this file is to perform a sensitivity analysis on the model evaluation
# results and graph the findings.
# The sensitivity analysis is done by varying the number of models used in the ensemble
# and observing the effect on overall accuracy.
# We take 50 different random arrangements of models for each ensemble size (except the
# full ensemble, which is evaluated once deterministically) to get a distribution of
# accuracies for each ensemble size.
# Each ensemble size has associated error bars based on the standard deviation of its
# accuracies.

import os
import pathlib as pl

import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

from utils.config import config

# Load the evaluation results
os.chdir(pl.Path(__file__).parent)
model_dataset_path = pl.Path("../model_evaluations") / pl.Path(
    config["analysis"]["evaluation_name"].strip()
).with_suffix(".nc")
print(f"Loading evaluation results from {model_dataset_path}")
array = xr.open_dataset(model_dataset_path)  # type: ignore

# This section was generated by GitHub Copilot - 2025-11-04
# Perform sensitivity analysis by varying ensemble size and sampling subsets of models.
predictions: xr.DataArray = array["predictions"]
labels: xr.DataArray = array["labels"]

# Make plots directory if it doesn't exist (matching other scripts)
plots_dir = (
    pl.Path("../output") / pl.Path(config["analysis"]["evaluation_name"]) / "plots"
)
plots_dir.mkdir(parents=True, exist_ok=True)

# Configuration for the sensitivity analysis
rng = np.random.default_rng(42)
num_models = int(predictions.sizes["model"])
ensemble_sizes = list(range(1, num_models + 1))
samples_per_size = 50

# Extract true labels for the positive class (assumes the same structure as the other script)
true_labels = labels.sel(label=1).values  # shape: (n_samples,)

# Containers for results
mean_accuracies: list[float] = []
std_accuracies: list[float] = []
all_accuracies: dict[int, list[float]] = {k: [] for k in ensemble_sizes}

for k in ensemble_sizes:
    accuracies_k = []
    # If using the full set, evaluate once deterministically
    if k == num_models:
        selected_idx = np.arange(num_models)
        preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
        confs = preds_selected.sel(img_class=1).values
        predicted_positive = confs >= 0.5
        true_positive = true_labels == 1
        acc = (predicted_positive == true_positive).sum().item() / len(confs)
        accuracies_k.append(acc)
    else:
        for _ in range(samples_per_size):
            selected_idx = rng.choice(num_models, size=k, replace=False)
            preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
            confs = preds_selected.sel(img_class=1).values
            predicted_positive = confs >= 0.5
            true_positive = true_labels == 1
            acc = (predicted_positive == true_positive).sum().item() / len(confs)
            accuracies_k.append(acc)

    all_accuracies[k] = accuracies_k
    mean_accuracies.append(float(np.mean(accuracies_k)))
    std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
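
# Console summary of the sensitivity results. This is a small optional addition
# (not part of the original analysis) that mirrors the print-style logging used
# when loading the dataset above.
for k, mean_acc, std_acc in zip(ensemble_sizes, mean_accuracies, std_accuracies):
    print(f"ensemble size {k:>3}: accuracy {mean_acc:.4f} +/- {std_acc:.4f}")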

# Plot mean accuracy vs ensemble size with error bars (std)
plt.figure(figsize=(10, 6))
plt.errorbar(ensemble_sizes, mean_accuracies, yerr=std_accuracies, fmt="-o", capsize=3)
plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Accuracy")
plt.grid(True)

# Set x-ticks every 5 models (and always include the final model count)
ticks = list(range(1, num_models + 1, 5))
if len(ticks) == 0 or ticks[-1] != num_models:
    ticks.append(num_models)
plt.xticks(ticks)

# Optionally overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    y = all_accuracies[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.3, s=8, color="gray")

plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
# End of Copilot section
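
# Optional: persist the raw sensitivity results alongside the plot.
# A minimal sketch (an addition, not part of the analysis above) showing how the
# sampled accuracy distributions could be saved so the figure can be regenerated
# without rerunning the random sampling. The filename
# "sensitivity_accuracies.npz" is an assumption and is not referenced elsewhere.
np.savez(
    plots_dir / "sensitivity_accuracies.npz",
    ensemble_sizes=np.array(ensemble_sizes),
    mean_accuracies=np.array(mean_accuracies),
    std_accuracies=np.array(std_accuracies),
    **{f"accuracies_k{k}": np.asarray(v) for k, v in all_accuracies.items()},
)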