# The purpose of this file is to perform a sensitivity analysis on the model evaluation
# results and graph the findings.
# The sensitivity analysis is done by varying the number of models used in the ensemble
# and observing the effect on overall accuracy.
# We take 50 different random arrangements of models for each ensemble size (except the
# full ensemble, which is evaluated once deterministically) to get a distribution of
# accuracies for each ensemble size.
# Each ensemble size has associated error bars based on the standard deviation of its
# accuracies.

import os
import pathlib as pl

import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

from utils.config import config

# Load the evaluation results
os.chdir(pl.Path(__file__).parent)
model_dataset_path = pl.Path("../model_evaluations") / pl.Path(
    config["analysis"]["evaluation_name"].strip()
).with_suffix(".nc")
print(f"Loading evaluation results from {model_dataset_path}")
array = xr.open_dataset(model_dataset_path)  # type: ignore

# This section was generated by GitHub Copilot - 2025-11-04
# Perform sensitivity analysis by varying ensemble size and sampling subsets of models.
predictions: xr.DataArray = array["predictions"]
labels: xr.DataArray = array["labels"]

# Make plots directory if it doesn't exist (matching other scripts)
plots_dir = (
    pl.Path("../output") / pl.Path(config["analysis"]["evaluation_name"]) / "plots"
)
plots_dir.mkdir(parents=True, exist_ok=True)

# Configuration for the sensitivity analysis
rng = np.random.default_rng(42)
num_models = int(predictions.sizes["model"])
ensemble_sizes = list(range(1, num_models + 1))
samples_per_size = 50

# Extract true labels for the positive class (assumes the same structure as the other script)
true_labels = labels.sel(label=1).values  # shape: (n_samples,)

# Containers for results
mean_accuracies: list[float] = []
std_accuracies: list[float] = []
all_accuracies: dict[int, list[float]] = {k: [] for k in ensemble_sizes}

for k in ensemble_sizes:
    accuracies_k = []
    # If using the full set, evaluate once deterministically
    if k == num_models:
        selected_idx = np.arange(num_models)
        preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
        confs = preds_selected.sel(img_class=1).values
        predicted_positive = confs >= 0.5
        true_positive = true_labels == 1
        acc = (predicted_positive == true_positive).sum().item() / len(confs)
        accuracies_k.append(acc)
    else:
        for _ in range(samples_per_size):
            selected_idx = rng.choice(num_models, size=k, replace=False)
            preds_selected = predictions.isel(model=selected_idx).mean(dim="model")
            confs = preds_selected.sel(img_class=1).values
            predicted_positive = confs >= 0.5
            true_positive = true_labels == 1
            acc = (predicted_positive == true_positive).sum().item() / len(confs)
            accuracies_k.append(acc)

    all_accuracies[k] = accuracies_k
    mean_accuracies.append(float(np.mean(accuracies_k)))
    std_accuracies.append(float(np.std(accuracies_k, ddof=0)))
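
# Console summary of the sensitivity results. This is a small optional addition
# (not part of the original analysis) that mirrors the print-style logging used
# when loading the dataset above.
for k, mean_acc, std_acc in zip(ensemble_sizes, mean_accuracies, std_accuracies):
    print(f"ensemble size {k:>3}: accuracy {mean_acc:.4f} +/- {std_acc:.4f}")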

# Plot mean accuracy vs ensemble size with error bars (std)
plt.figure(figsize=(10, 6))
plt.errorbar(ensemble_sizes, mean_accuracies, yerr=std_accuracies, fmt="-o", capsize=3)
plt.title("Sensitivity Analysis: Accuracy vs Ensemble Size")
plt.xlabel("Number of Models in Ensemble")
plt.ylabel("Accuracy")
plt.grid(True)

# Set x-ticks every 5 models (and always include the final model count)
ticks = list(range(1, num_models + 1, 5))
if len(ticks) == 0 or ticks[-1] != num_models:
    ticks.append(num_models)
plt.xticks(ticks)

# Optionally overlay raw sample distributions as jittered points
for k in ensemble_sizes:
    y = all_accuracies[k]
    x = np.full(len(y), k) + (rng.random(len(y)) - 0.5) * 0.2  # small jitter
    plt.scatter(x, y, alpha=0.3, s=8, color="gray")

plt.tight_layout()
plt.savefig(plots_dir / "sensitivity_accuracy_vs_ensemble_size.png")
# End of Copilot section
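
# Optional: persist the raw sensitivity results alongside the plot.
# A minimal sketch (an addition, not part of the analysis above) showing how the
# sampled accuracy distributions could be saved so the figure can be regenerated
# without rerunning the random sampling. The filename
# "sensitivity_accuracies.npz" is an assumption and is not referenced elsewhere.
np.savez(
    plots_dir / "sensitivity_accuracies.npz",
    ensemble_sizes=np.array(ensemble_sizes),
    mean_accuracies=np.array(mean_accuracies),
    std_accuracies=np.array(std_accuracies),
    **{f"accuracies_k{k}": np.asarray(v) for k, v in all_accuracies.items()},
)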