4 Commits cd26ee24c3 ... 7775cf4b28

Author SHA1 Message Date
  Nicholas Schense 7775cf4b28 Begin threshold refactor 4 months ago
  Nicholas Schense f18dfa3d01 Implemented entropy calculation 4 months ago
  Nicholas Schense fb1dbc94c2 Implemented metrics 5 months ago
  Nicholas Schense cc357abff3 Refactored Threshold code 5 months ago
6 files changed with 476 additions and 211 deletions
  1. LP_ADNIMERGE.csv (+0 -0)
  2. config.toml (+3 -3)
  3. daily_log.md (+13 -0)
  4. planning.md (+14 -3)
  5. threshold.py (+422 -192)
  6. utils/metrics.py (+24 -13)

File diff suppressed because it is too large
+ 0 - 0
LP_ADNIMERGE.csv


+ 3 - 3
config.toml

@@ -7,7 +7,7 @@ model_output = '/export/home/nschense/alzheimers/alzheimers_nn/saved_models/'
 
 [training]
 device = 'cuda:1'
-runs = 100
+runs = 50
 max_epochs = 30
 
 [dataset]
@@ -16,7 +16,7 @@ validation_split = 0.4 #Splits the dataset into the train and validation/test se
 #|split*0.5 | split*0.5      | 1-split   |
 
 [model]
-name = 'cnn-100x30-2'
+name = 'cnn-50x30'
 image_channels = 1
 clin_data_channels = 2
 
@@ -29,5 +29,5 @@ droprate = 0.5
 silent = false
 
 [ensemble]
-name = 'cnn-100x30-2'
+name = 'cnn-50x30'
 prune_threshold = 0.0 # Any models with accuracy below this threshold will be pruned, set to 0 to disable pruning
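
For reference, a minimal sketch (not part of the commit) of how these fields are consumed, following the config-loading pattern visible at the top of threshold.py; the 'config.toml' fallback path is an assumption, since only the ADL_CONFIG_PATH branch appears in this diff:

import os
import toml

# Minimal sketch, not from the repo: load the config the way threshold.py does
# and read the fields touched by this commit. The 'config.toml' fallback is an
# assumed default; only the ADL_CONFIG_PATH branch is visible in this diff.
config_path = os.getenv('ADL_CONFIG_PATH', 'config.toml')
with open(config_path) as f:
    config = toml.load(f)

print(config['training']['runs'])             # 50 after this change
print(config['model']['name'])                # 'cnn-50x30'
print(config['ensemble']['prune_threshold'])  # 0.0 disables pruning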

+ 13 - 0
daily_log.md

@@ -41,4 +41,17 @@ Slow day today, mostly continued with reading. Began training a new model with 1
 ### Future
 - Meet with Ali
 - Continue reading
+
+## Wednesday, June 19, 2024
+Work from home day, mostly planning and reading. Met with Ali and Brayden, discussed progress and began initial thinking about future paths for the project. Main focus is on improving our uncertainty metrics.
+
+### Progress
+- Met with Ali and Brayden
+- Began thinking about future progress
+- Worked on ensemble a bit
+
+### Future
+- Fix uncertainty metric
+- Evaluate performance of different metrics
+- Write research proposal
   

+ 14 - 3
planning.md

@@ -3,9 +3,9 @@
 As of now, we have a program set up to be able to:
 
 - train an individual model with specific hyperparameters
-- train a ensemble of models with the identical hyperparameters 
-- evaluate the accuracy of an ensemble of models 
-- perform a coverage analysis on an ensemble of models 
+- train an ensemble of models with identical hyperparameters
+- evaluate the accuracy of an ensemble of models
+- perform a coverage analysis on an ensemble of models
 
 The goal of this rewrite is to preserve those functions while making the program significantly cleaner and easier to use, and to make it easier to extend with new functionality in the future as well. The hope is for this project to take approximately 1-2 days, and be completed by Monday (6/17). The additional features that I would like to implement are:
 
@@ -16,3 +16,14 @@ The goal of this rewrite is to preserve those functions while making the program
 - Implementation of new metrics and ensembles
 - Deterministic dataloading (for a specified seed, the data used is set and does not change, even if the loading methods do)
 
+## Further Planning as of 7/8/24
+
+- With the implementation of uncertainty through standard deviation, confidence and entropy (see the sketch at the end of this file's diff), the next steps are:
+  - Refactor the current threshold implementation - it is very messy and difficult to extend with new features
+  - Enable inspecting images with incorrect predictions, and predictions that fall off the main curve in the stdev-confidence plot
+  - Investigate physician confidence, and compare it to uncertainty predictions
+  - Deep dive into standard deviation
+  - Box plot?
+  - Investigate calibration - do we need it?
+  - Consider the manuscript - should start thinking about writing
+  
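
The three uncertainty measures named above are all derived from the per-model softmax outputs; the following is a minimal NumPy sketch of how threshold.py computes them for a single sample (the function and variable names here are illustrative, not from the repo):

import numpy as np

# Illustrative helper: model_outputs is a list of per-model softmax vectors
# for one sample, e.g. [[0.8, 0.2], [0.7, 0.3]] (made-up values).
def ensemble_uncertainty(model_outputs):
    outputs = np.asarray(model_outputs)           # shape (n_models, 2)
    mean = outputs.mean(axis=0)                   # ensemble softmax
    predicted_class = int(np.argmax(mean))

    confidence = (mean.max() - 0.5) * 2           # rescale [0.5, 1] to [0, 1]
    stdev = outputs.std(axis=0)[predicted_class]  # spread of the winning class
    entropy = -np.sum(mean * np.log(mean))        # entropy of the mean softmax

    return predicted_class, confidence, stdev, entropy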

+ 422 - 192
threshold.py

@@ -9,8 +9,46 @@ import matplotlib.pyplot as plt
 import sklearn.metrics as metrics
 from tqdm import tqdm
 import utils.metrics as met
+import itertools as it
+import matplotlib.ticker as ticker
+
+
+# Define plotting helper function
+def plot_coverage(
+    percentiles,
+    ensemble_results,
+    individual_results,
+    title,
+    x_label,
+    y_label,
+    save_path,
+    flip=False,
+):
+    fig, ax = plt.subplots()
+    plt.plot(
+        percentiles,
+        ensemble_results,
+        'ob',
+        label='Ensemble',
+    )
+    plt.plot(
+        percentiles,
+        individual_results,
+        'xr',
+        label='Individual (on entire dataset)',
+    )
+    plt.xlabel(x_label)
+    plt.ylabel(y_label)
+    plt.title(title)
+    plt.legend()
+    if flip:
+        plt.gca().invert_xaxis()
+    ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
+    plt.savefig(save_path)
+    plt.close()
+
 
-RUN = True
+RUN = False
 
 # CONFIGURATION
 if os.getenv('ADL_CONFIG_PATH') is None:
@@ -21,239 +59,431 @@ else:
         config = toml.load(f)
 
 
-# This function returns a list of the accuracies given a threshold
-def threshold(config):
-    # First, get the model data
-    test_set = torch.load(
-        config['paths']['model_output']
-        + config['ensemble']['name']
-        + '/test_dataset.pt'
-    )
+ENSEMBLE_PATH = f"{config['paths']['model_output']}{config['ensemble']['name']}"
 
-    vs = torch.load(
-        config['paths']['model_output'] + config['ensemble']['name'] + '/val_dataset.pt'
-    )
+V2_PATH = ENSEMBLE_PATH + '/v2'
 
-    test_set = test_set + vs
 
-    models, _ = ens.load_models(
-        config['paths']['model_output'] + config['ensemble']['name'] + '/models/',
-        config['training']['device'],
-    )
+# The model output is a 1x2 tensor containing the softmax over the 2 classes;
+# we want to convert it to a predicted class and a confidence
+def output_to_confidence(result):
+    predicted_class = torch.argmax(result).item()
+    confidence = (torch.max(result).item() - 0.5) * 2
 
-    indv_model = models[0]
-
-    predictions = []
-    indv_predictions = []
-
-    # Evaluate ensemble and uncertainty test set
-    for mdata, target in tqdm(test_set, total=len(test_set)):
-        mri, xls = mdata
-        mri = mri.unsqueeze(0)
-        xls = xls.unsqueeze(0)
-        mdata = (mri, xls)
-        mean, variance = ens.ensemble_predict(models, mdata)
-        stdev = torch.sqrt(variance)
-        prediction = mean.item()
-
-        target = target[1]
-
-        # Check if the prediction is correct
-        correct = (prediction < 0.5 and int(target.item()) == 0) or (
-            prediction >= 0.5 and int(target.item()) == 1
-        )
-
-        predictions.append(
-            {
-                'Prediction': prediction,
-                'Actual': target.item(),
-                'Stdev': stdev.item(),
-                'Correct': correct,
-            }
-        )
-
-        i_mean = indv_model(mdata)[:, 1].item()
-        i_correct = (i_mean < 0.5 and int(target.item()) == 0) or (
-            i_mean >= 0.5 and int(target.item()) == 1
-        )
-
-        indv_predictions.append(
-            {
-                'Prediction': i_mean,
-                'Actual': target.item(),
-                'Stdev': 0,
-                'Correct': i_correct,
-            }
-        )
-
-    # Sort the predictions by the uncertainty
-    predictions = pd.DataFrame(predictions).sort_values(by='Stdev')
-
-    # Calculate the metrics for the individual model
-    indv_predictions = pd.DataFrame(indv_predictions)
-    indv_correct = indv_predictions['Correct'].sum()
-    indv_accuracy = indv_correct / len(indv_predictions)
-    indv_false_pos = len(
-        indv_predictions[
-            (indv_predictions['Prediction'] >= 0.5) & (indv_predictions['Actual'] == 0)
-        ]
-    )
-    indv_false_neg = len(
-        indv_predictions[
-            (indv_predictions['Prediction'] < 0.5) & (indv_predictions['Actual'] == 1)
-        ]
+    return torch.Tensor([predicted_class, confidence])
+
+
+# This function runs every model in the ensemble over the test set and returns the raw results
+# along with the derived confidence, standard deviation, entropy and individual-model results
+def get_predictions(config):
+    models, model_descs = ens.load_models(
+        f'{ENSEMBLE_PATH}/models/',
+        config['training']['device'],
     )
-    indv_f1 = 2 * indv_correct / (2 * indv_correct + indv_false_pos + indv_false_neg)
-    indv_auc = metrics.roc_auc_score(
-        indv_predictions['Actual'], indv_predictions['Prediction']
+    models = [model.to(config['training']['device']) for model in models]
+    test_set = torch.load(f'{ENSEMBLE_PATH}/test_dataset.pt') + torch.load(
+        f'{ENSEMBLE_PATH}/val_dataset.pt'
     )
+    print(f'Loaded {len(test_set)} samples')
 
-    indv_metrics = {'Accuracy': indv_accuracy, 'F1': indv_f1, 'AUC': indv_auc}
-
-    thresholds = []
-    quantiles = np.arange(0.1, 1, 0.1)
-    # get uncertainty quantiles
-    for quantile in quantiles:
-        thresholds.append(predictions['Stdev'].quantile(quantile))
-
-    # Calculate the accuracy of the model for each threshold
-    accuracies = []
-    # Calculate the accuracy of the model for each threshold
-    for threshold, quantile in zip(thresholds, quantiles):
-        filtered = predictions[predictions['Stdev'] <= threshold]
-        correct = filtered['Correct'].sum()
-        total = len(filtered)
-        accuracy = correct / total
-
-        false_positives = len(
-            filtered[(filtered['Prediction'] >= 0.5) & (filtered['Actual'] == 0)]
-        )
-
-        false_negatives = len(
-            filtered[(filtered['Prediction'] < 0.5) & (filtered['Actual'] == 1)]
-        )
-
-        f1 = 2 * correct / (2 * correct + false_positives + false_negatives)
-
-        auc = metrics.roc_auc_score(filtered['Actual'], filtered['Prediction'])
-
-        accuracies.append(
-            {
-                'Threshold': threshold,
-                'Accuracy': accuracy,
-                'Quantile': quantile,
-                'F1': f1,
-                'AUC': auc,
-            }
-        )
-
-    predictions.to_csv(
-        f"{config['paths']['model_output']}{config['ensemble']['name']}/predictions.csv"
-    )
+    # [([model results], labels)]
+    results = []
 
-    indv_predictions.to_csv(
-        f"{config['paths']['model_output']}{config['ensemble']['name']}/indv_predictions.csv"
-    )
+    # [(class_1, class_2, true_label)]
+    indv_results = []
+
+    for i, (data, target) in tqdm(
+        enumerate(test_set),
+        total=len(test_set),
+        desc='Getting predictions',
+        unit='sample',
+    ):
+        mri, xls = data
+        mri = mri.unsqueeze(0).to(config['training']['device'])
+        xls = xls.unsqueeze(0).to(config['training']['device'])
+        data = (mri, xls)
+        res = []
+        for j, model in enumerate(models):
+            model.eval()
+            with torch.no_grad():
+                output = model(data)
+
+                output = output.tolist()
+
+                if j == 0:
+                    indv_results.append((output[0][0], output[0][1], target[1].item()))
+
+                res.append(output)
+        results.append((res, target.tolist()))
+
+    # The results are a list of tuples, where each tuple contains a list of model outputs and the true label
+    # We want to convert this to three lists of tuples: one with the ensemble predicted class, confidence and true label,
+    # one with the standard deviation, and one with the entropy
+
+    # [(ensemble predicted class, ensemble confidence, true label)]
+    confidences = []
+
+    # [(ensemble predicted class, ensemble standard deviation, true label)]
+    stdevs = []
+
+    # [(ensemble predicted class, ensemble entropy, true label)]
+    entropies = []
+
+    for result in results:
+        model_results, true_label = result
+        # Get the ensemble mean and variance with numpy, as these are lists
+        mean = np.mean(model_results, axis=0)
+        variance = np.var(model_results, axis=0)
 
-    return pd.DataFrame(accuracies), indv_metrics
+        # Calculate the entropy
+        entropy = -1 * np.sum(mean * np.log(mean))
+
+        # Calculate confidence and standard deviation
+        confidence = (np.max(mean) - 0.5) * 2
+        stdev = np.sqrt(variance)
+
+        # Get the predicted class
+        predicted_class = np.argmax(mean)
+
+        # Get the standard deviation of the predicted class
+        pc_stdev = np.squeeze(stdev)[predicted_class]
+        # Get the individual classes
+        class_1 = mean[0][0]
+        class_2 = mean[0][1]
+
+        # Get the true label
+        true_label = true_label[1]
+
+        confidences.append((predicted_class, confidence, true_label, class_1, class_2))
+        stdevs.append((predicted_class, pc_stdev, true_label, class_1, class_2))
+        entropies.append((predicted_class, entropy, true_label, class_1, class_2))
+
+    return results, confidences, stdevs, entropies, indv_results
 
 
 if RUN:
-    result, indv = threshold(config)
-    result.to_csv(
-        f"{config['paths']['model_output']}{config['ensemble']['name']}/coverage.csv"
+    results, confs, stdevs, entropies, indv_results = get_predictions(config)
+    # Convert to pandas dataframes
+    confs_df = pd.DataFrame(
+        confs,
+        columns=['predicted_class', 'confidence', 'true_label', 'class_1', 'class_2'],
     )
-    indv = pd.DataFrame([indv])
-    indv.to_csv(
-        f"{config['paths']['model_output']}{config['ensemble']['name']}/indv_metrics.csv"
+    stdevs_df = pd.DataFrame(
+        stdevs, columns=['predicted_class', 'stdev', 'true_label', 'class_1', 'class_2']
     )
 
-result = pd.read_csv(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/coverage.csv"
-)
-predictions = pd.read_csv(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/predictions.csv"
+    entropies_df = pd.DataFrame(
+        entropies,
+        columns=['predicted_class', 'entropy', 'true_label', 'class_1', 'class_2'],
+    )
+
+    indv_df = pd.DataFrame(indv_results, columns=['class_1', 'class_2', 'true_label'])
+
+    if not os.path.exists(V2_PATH):
+        os.makedirs(V2_PATH)
+
+    confs_df.to_csv(f'{V2_PATH}/ensemble_confidences.csv')
+    stdevs_df.to_csv(f'{V2_PATH}/ensemble_stdevs.csv')
+    entropies_df.to_csv(f'{V2_PATH}/ensemble_entropies.csv')
+    indv_df.to_csv(f'{V2_PATH}/individual_results.csv')
+else:
+    confs_df = pd.read_csv(f'{V2_PATH}/ensemble_confidences.csv')
+    stdevs_df = pd.read_csv(f'{V2_PATH}/ensemble_stdevs.csv')
+    entropies_df = pd.read_csv(f'{V2_PATH}/ensemble_entropies.csv')
+    indv_df = pd.read_csv(f'{V2_PATH}/individual_results.csv')
+
+
+# Plot confidence vs standard deviation, coloring points by whether the prediction is correct
+correct_conf = confs_df[confs_df['predicted_class'] == confs_df['true_label']]
+incorrect_conf = confs_df[confs_df['predicted_class'] != confs_df['true_label']]
+
+correct_stdev = stdevs_df[stdevs_df['predicted_class'] == stdevs_df['true_label']]
+incorrect_stdev = stdevs_df[stdevs_df['predicted_class'] != stdevs_df['true_label']]
+
+correct_ent = entropies_df[
+    entropies_df['predicted_class'] == entropies_df['true_label']
+]
+incorrect_ent = entropies_df[
+    entropies_df['predicted_class'] != entropies_df['true_label']
+]
+
+plot, ax = plt.subplots()
+plt.scatter(
+    correct_conf['confidence'],
+    correct_stdev['stdev'],
+    color='green',
+    label='Correct Prediction',
 )
-indv = pd.read_csv(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/indv_metrics.csv"
+plt.scatter(
+    incorrect_conf['confidence'],
+    incorrect_stdev['stdev'],
+    color='red',
+    label='Incorrect Prediction',
 )
+plt.xlabel('Confidence (Raw Value)')
+plt.ylabel('Standard Deviation (Raw Value)')
+plt.title('Confidence vs Standard Deviation')
+plt.legend()
+plt.savefig(f'{V2_PATH}/confidence_vs_stdev.png')
 
-print(indv)
+plt.close()
 
+# Do the same for confidence vs entropy
+plot, ax = plt.subplots()
+plt.scatter(
+    correct_conf['confidence'],
+    correct_ent['entropy'],
+    color='green',
+    label='Correct Prediction',
+)
+plt.scatter(
+    incorrect_conf['confidence'],
+    incorrect_ent['entropy'],
+    color='red',
+    label='Incorrect Prediction',
+)
+plt.xlabel('Confidence (Raw Value)')
+plt.ylabel('Entropy (Raw Value)')
+plt.title('Confidence vs Entropy')
+plt.legend()
+plt.savefig(f'{V2_PATH}/confidence_vs_entropy.png')
 
-plt.figure()
+plt.close()
 
-plt.plot(result['Quantile'], result['Accuracy'], label='Ensemble Accuracy')
 
-plt.plot(
-    result['Quantile'],
-    [indv['Accuracy']] * len(result['Quantile']),
-    label='Individual Accuracy',
-    linestyle='--',
+# Calculate individual model accuracy and entropy
+# Determine predicted class
+indv_df['predicted_class'] = indv_df[['class_1', 'class_2']].idxmax(axis=1)
+indv_df['predicted_class'] = indv_df['predicted_class'].apply(
+    lambda x: 0 if x == 'class_1' else 1
 )
-plt.legend()
+indv_df['correct'] = indv_df['predicted_class'] == indv_df['true_label']
+accuracy_indv = indv_df['correct'].mean()
+f1_indv = met.F1(
+    indv_df['predicted_class'].to_numpy(), indv_df['true_label'].to_numpy()
+)
+auc_indv = metrics.roc_auc_score(
+    indv_df['true_label'].to_numpy(), indv_df['class_2'].to_numpy()
+)
+indv_df['entropy'] = -1 * indv_df[['class_1', 'class_2']].apply(
+    lambda x: x * np.log(x), axis=0
+).sum(axis=1)
+
+# Calculate percentiles for confidence and standard deviation
+quantiles_conf = confs_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
+    'confidence'
+]
+quantiles_stdev = stdevs_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
+    'stdev'
+]
+
+# Additionally for individual confidence
+quantiles_indv_conf = indv_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
+    'class_2'
+]
+
+# For individual entropy
+quantiles_indv_entropy = indv_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
+    'entropy'
+]
+
+#
+
+accuracies_conf = []
+# Use the quantiles to calculate the coverage
+iter_conf = it.islice(quantiles_conf.items(), 0, None)
+for quantile in iter_conf:
+    percentile = quantile[0]
+
+    filt = confs_df[confs_df['confidence'] >= quantile[1]]
+    accuracy = (
+        filt[filt['predicted_class'] == filt['true_label']].shape[0] / filt.shape[0]
+    )
+    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
 
-plt.title('Accuracy vs Coverage')
+    accuracies_conf.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+
+accuracies_df = pd.DataFrame(accuracies_conf)
+
+indv_conf = []
+# Use the quantiles to calculate the coverage
+iter_conf = it.islice(quantiles_indv_conf.items(), 0, None)
+for quantile in iter_conf:
+    percentile = quantile[0]
+
+    filt = indv_df[indv_df['class_2'] >= quantile[1]]
+    accuracy = filt['correct'].mean()
+    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+
+    indv_conf.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+
+indv_conf_df = pd.DataFrame(indv_conf)
+
+# Do the same for entropy
+indv_entropy = []
+iter_entropy = it.islice(quantiles_indv_entropy.items(), 0, None)
+for quantile in iter_entropy:
+    percentile = quantile[0]
+
+    filt = indv_df[indv_df['entropy'] <= quantile[1]]
+    accuracy = filt['correct'].mean()
+    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+
+    indv_entropy.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+
+indv_entropy_df = pd.DataFrame(indv_entropy)
 
-plt.xlabel('Coverage')
-plt.ylabel('Accuracy')
-plt.gca().invert_xaxis()
 
-plt.savefig(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/coverage.png"
+# Plot the coverage for confidence and accuracy
+plot_coverage(
+    accuracies_df['percentile'],
+    accuracies_df['accuracy'],
+    indv_conf_df['accuracy'],
+    'Confidence Accuracy Coverage Plot',
+    'Minimum Confidence Percentile (Low to High)',
+    'Accuracy',
+    f'{V2_PATH}/coverage_conf.png',
 )
 
-plt.figure()
-plt.plot(result['Quantile'], result['F1'], label='Ensemble F1')
+# Plot the coverage for confidence and F1
+plot_coverage(
+    accuracies_df['percentile'],
+    accuracies_df['f1'],
+    indv_conf_df['f1'],
+    'Confidence F1 Coverage Plot',
+    'Minimum Confidence Percentile (Low to High)',
+    'F1',
+    f'{V2_PATH}/f1_coverage_conf.png',
+)
+
+# Repeat for standard deviation
+accuracies_stdev = []
+iter_stdev = it.islice(quantiles_stdev.items(), 0, None)
+for quantile in iter_stdev:
+    percentile = quantile[0]
+
+    filt = stdevs_df[stdevs_df['stdev'] <= quantile[1]]
+    accuracy = (
+        filt[filt['predicted_class'] == filt['true_label']].shape[0] / filt.shape[0]
+    )
+    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+
+    accuracies_stdev.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+
+accuracies_stdev_df = pd.DataFrame(accuracies_stdev)
+
+fig, ax = plt.subplots()
 plt.plot(
-    result['Quantile'],
-    [indv['F1']] * len(result['Quantile']),
-    label='Individual F1',
-    linestyle='--',
+    accuracies_stdev_df['percentile'],
+    accuracies_stdev_df['accuracy'],
+    'ob',
+    label='Ensemble',
 )
+plt.plot(
+    accuracies_stdev_df['percentile'],
+    [accuracy_indv] * len(accuracies_stdev_df['percentile']),
+    'xr',
+    label='Individual (on entire dataset)',
+)
+plt.xlabel('Maximum Standard Deviation Percentile (High to Low)')
+plt.ylabel('Accuracy')
+plt.title('Standard Deviation Accuracy Coverage Plot')
 plt.legend()
-plt.title('F1 vs Coverage')
-
-plt.xlabel('Coverage')
-plt.ylabel('F1')
 plt.gca().invert_xaxis()
+ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
+plt.savefig(f'{V2_PATH}/coverage_stdev.png')
+plt.close()
 
-plt.savefig(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/coverage_f1.png"
+# Plot coverage vs F1 for standard deviation
+fig, ax = plt.subplots()
+plt.plot(
+    accuracies_stdev_df['percentile'], accuracies_stdev_df['f1'], 'ob', label='Ensemble'
 )
-
-plt.figure()
-plt.plot(result['Quantile'], result['AUC'], label='Ensemble AUC')
 plt.plot(
-    result['Quantile'],
-    [indv['AUC']] * len(result['Quantile']),
-    label='Individual AUC',
-    linestyle='--',
+    accuracies_stdev_df['percentile'],
+    [f1_indv] * len(accuracies_stdev_df['percentile']),
+    'xr',
+    label='Individual (on entire dataset)',
 )
+plt.xlabel('Maximum Standard Deviation Percentile (High to Low)')
+plt.ylabel('F1')
+plt.title('Standard Deviation F1 Coverage Plot')
 plt.legend()
-plt.title('AUC vs Coverage')
-plt.xlabel('Coverage')
-plt.ylabel('AUC')
 plt.gca().invert_xaxis()
+ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
+plt.savefig(f'{V2_PATH}/coverage_f1_stdev.png')
+
+plt.close()
 
-plt.savefig(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/coverage_auc.png"
+
+# Print overall accuracy
+overall_accuracy = (
+    confs_df[confs_df['predicted_class'] == confs_df['true_label']].shape[0]
+    / confs_df.shape[0]
+)
+overall_f1 = met.F1(
+    confs_df['predicted_class'].to_numpy(), confs_df['true_label'].to_numpy()
+)
+# Calculate ECE for confidence and standard deviation
+conf_ece = met.ECE(
+    confs_df['predicted_class'].to_numpy(),
+    confs_df['confidence'].to_numpy(),
+    confs_df['true_label'].to_numpy(),
 )
 
-# create histogram of the incorrect predictions vs the uncertainty
-plt.figure()
-plt.hist(predictions[~predictions['Correct']]['Stdev'], bins=10)
-plt.xlabel('Uncertainty')
-plt.ylabel('Number of incorrect predictions')
-plt.savefig(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/incorrect_predictions.png"
+stdev_ece = met.ECE(
+    stdevs_df['predicted_class'].to_numpy(),
+    stdevs_df['stdev'].to_numpy(),
+    stdevs_df['true_label'].to_numpy(),
 )
 
-ece = met.ECE(predictions['Prediction'], predictions['Actual'])
 
-print(f'ECE: {ece}')
+print(f'Overall accuracy: {overall_accuracy}, Overall F1: {overall_f1}')
+print(f'Confidence ECE: {conf_ece}')
+print(f'Standard Deviation ECE: {stdev_ece}')
+
+
+# Repeat for entropy
+quantiles_entropy = entropies_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
+    'entropy'
+]
 
-with open(
-    f"{config['paths']['model_output']}{config['ensemble']['name']}/summary.txt", 'a'
-) as f:
-    f.write(f'ECE: {ece}\n')
+accuracies_entropy = []
+iter_entropy = it.islice(quantiles_entropy.items(), 0, None)
+for quantile in iter_entropy:
+    percentile = quantile[0]
+
+    filt = entropies_df[entropies_df['entropy'] <= quantile[1]]
+    accuracy = (
+        filt[filt['predicted_class'] == filt['true_label']].shape[0] / filt.shape[0]
+    )
+    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+
+    accuracies_entropy.append(
+        {'percentile': percentile, 'accuracy': accuracy, 'f1': f1}
+    )
+
+accuracies_entropy_df = pd.DataFrame(accuracies_entropy)
+
+
+# Plot the coverage for entropy and accuracy
+plot_coverage(
+    accuracies_entropy_df['percentile'],
+    accuracies_entropy_df['accuracy'],
+    indv_entropy_df['accuracy'],
+    'Entropy Accuracy Coverage Plot',
+    'Maximum Entropy Percentile (Low to High)',
+    'Accuracy',
+    f'{V2_PATH}/coverage_entropy.png',
+)
+
+# Plot the coverage for entropy and F1
+plot_coverage(
+    accuracies_entropy_df['percentile'],
+    accuracies_entropy_df['f1'],
+    indv_entropy_df['f1'],
+    'Entropy F1 Coverage Plot',
+    'Maximum Entropy Percentile (High to Low)',
+    'F1',
+    f'{V2_PATH}/f1_coverage_entropy.png',
+    flip=True,
+)
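
The confidence, standard-deviation and entropy coverage loops above repeat the same quantile-filter-score pattern, which is the messiness the planned refactor targets; below is a hedged sketch of one possible consolidation (the helper name and signature are illustrative, not from the repo):

import numpy as np
import pandas as pd
import utils.metrics as met

# Illustrative helper, not part of the repo: compute accuracy/F1 coverage
# curves by thresholding a dataframe column at its 0%..100% quantiles.
def coverage_by_quantile(df, column, keep='below'):
    rows = []
    quantiles = df[column].quantile(np.linspace(0, 1, 11), interpolation='lower')
    for percentile, threshold in quantiles.items():
        if keep == 'below':
            filt = df[df[column] <= threshold]   # e.g. stdev, entropy
        else:
            filt = df[df[column] >= threshold]   # e.g. confidence
        accuracy = (filt['predicted_class'] == filt['true_label']).mean()
        f1 = met.F1(filt['predicted_class'].to_numpy(),
                    filt['true_label'].to_numpy())
        rows.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
    return pd.DataFrame(rows)

# Usage mirroring the loops above:
# accuracies_df = coverage_by_quantile(confs_df, 'confidence', keep='above')
# accuracies_stdev_df = coverage_by_quantile(stdevs_df, 'stdev', keep='below')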

+ 24 - 13
utils/metrics.py

@@ -1,26 +1,23 @@
 import numpy as np
+import sklearn.metrics as mt
 
 
 # ECE from https://towardsdatascience.com/expected-calibration-error-ece-a-step-by-step-visual-explanation-with-python-code-c3e9aa12937d
-def ECE(samples, true_labels, M=5):
+def ECE(predicted_labels, confidences, true_labels, M=5):
     # Uniform M bins
     bin_boundaries = np.linspace(0, 1, M + 1)
     bin_lowers = bin_boundaries[:-1]
     bin_uppers = bin_boundaries[1:]
 
-    # Get max probability per sample i
-    confidences = samples
-    predicted_label = true_labels
-
     # get correct/false
-    accuracies = predicted_label == true_labels
+    accuracies = predicted_labels == true_labels
 
     ece = np.zeros(1)
 
     for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
         # bin sample
         in_bin = np.logical_and(
-            confidences > bin_lower.item(), confidences < bin_upper.item()
+            confidences > bin_lower.item(), confidences <= bin_upper.item()
         )
         prob_in_bin = in_bin.mean()
 
@@ -33,17 +30,13 @@ def ECE(samples, true_labels, M=5):
 
 
 # Maximum Calibration error - maximum of error per bin
-def MCE(samples, true_labels, M=5):
+def MCE(predicted_labels, confidences, true_labels, M=5):
     bin_boundaries = np.linspace(0, 1, M + 1)
     bin_lowers = bin_boundaries[:-1]
     bin_uppers = bin_boundaries[1:]
 
-    # Get max probability per sample i
-    confidences = samples
-    predicted_label = true_labels
-
     # get correct/false
-    accuracies = predicted_label == true_labels
+    accuracies = predicted_labels == true_labels
 
     mces = []
 
@@ -60,3 +53,21 @@ def MCE(samples, true_labels, M=5):
             mces.append(np.abs(avg_confid - accuracy_in_bin))
 
     return max(mces)
+
+
+def F1(predicted_labels, true_labels):
+    tp = np.sum(np.logical_and(predicted_labels == 1, true_labels == 1))
+    fp = np.sum(np.logical_and(predicted_labels == 1, true_labels == 0))
+    fn = np.sum(np.logical_and(predicted_labels == 0, true_labels == 1))
+
+    precision = tp / (tp + fp)
+    recall = tp / (tp + fn)
+
+    return 2 * (precision * recall) / (precision + recall)
+
+
+# Uses sklearn's AUC function
+# Requires confidences to be the predicted probabilities for the positive class
+def AUC(confidences, true_labels):
+    fpr, tpr, _ = mt.roc_curve(true_labels, confidences)
+    return mt.auc(fpr, tpr)
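
A minimal usage sketch of the updated metric signatures, with made-up arrays to illustrate the expected argument order (hard predictions first, then confidence scores, then true labels for the calibration metric):

import numpy as np
import utils.metrics as met

# Made-up arrays: hard class predictions, confidence scores in [0, 1],
# and ground-truth labels.
predicted = np.array([1, 0, 1, 1, 0])
confidence = np.array([0.9, 0.6, 0.55, 0.8, 0.7])
truth = np.array([1, 0, 0, 1, 0])

print(met.ECE(predicted, confidence, truth, M=5))  # expected calibration error
print(met.F1(predicted, truth))                    # F1 from TP/FP/FN counts
print(met.AUC(confidence, truth))                  # ROC AUC via sklearn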

Some files were not shown because too many files changed in this diff