Browse Source

Beginning work on rewrite

Nicholas Schense 1 week ago
parent
commit
53c6d721b1
7 changed files with 188 additions and 181 deletions
  1. 1 0
      .vscode/launch.json
  2. 0 0
      LP_ADNIMERGE.csv
  3. 2 5
      bayesian.py
  4. 7 4
      config.toml
  5. 0 6
      ruff.toml
  6. 158 158
      threshold.py
  7. 20 8
      utils/data/datasets.py

+ 1 - 0
.vscode/launch.json

@@ -4,6 +4,7 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "version": "0.2.0",
     "configurations": [
     "configurations": [
+        {"name":"Python Debugger: Current File","type":"debugpy","request":"launch","program":"${file}","console":"integratedTerminal"},
         
         
         {
         {
             "name": "Python Debugger: Current File",
             "name": "Python Debugger: Current File",

File diff suppressed because it is too large
+ 0 - 0
LP_ADNIMERGE.csv


+ 2 - 5
bayesian.py

@@ -52,12 +52,9 @@ for epoch in range(config["training"]["epochs"]):
         loss = loss / len(data)
         loss = loss / len(data)
         loss.backward()
         loss.backward()
         optimizer.step()
         optimizer.step()
-        
 
 
-#Test the model
+
+# Test the model
 model.eval()
 model.eval()
 with torch.no_grad():
 with torch.no_grad():
     output_li
     output_li
-        
-    
-

+ 7 - 4
config.toml

@@ -1,12 +1,15 @@
 [paths]
 [paths]
-mri_data = '/data/data_wnx1/_Data/AlzheimersDL/CNN+RNN-2class-1cnn+data/PET_volumes_customtemplate_float32/'
-xls_data = '/export/home/nschense/alzheimers/alzheimers_nn/LP_ADNIMERGE.csv'
+#mri_data = '/data/data_wnx1/_Data/AlzheimersDL/CNN+RNN-2class-1cnn+data/PET_volumes_customtemplate_float32/'
+#xls_data = '/export/home/nschense/alzheimers/alzheimers_nn/LP_ADNIMERGE.csv'
+
+mri_data = '../data/PET_volumes_customtemplate_float32/'
+xls_data = '../data/LP_ADNIMERGE.csv'
 
 
 #CHANGE THESE BEFORE RUNNING
 #CHANGE THESE BEFORE RUNNING
-model_output = '/export/home/nschense/alzheimers/alzheimers_nn/saved_models/'
+model_output = '../models/'
 
 
 [training]
 [training]
-device = 'cuda:1' 
+device = 'mps' 
 runs = 50 # Number of models
 runs = 50 # Number of models
 max_epochs = 30 # Epochs per model
 max_epochs = 30 # Epochs per model
 
 

+ 0 - 6
ruff.toml

@@ -1,6 +0,0 @@
-[lint]
-select = ["E4", "E7", "E9", "F", "B"]
-
-
-[format]
-quote-style = "single"

+ 158 - 158
threshold.py

@@ -27,14 +27,14 @@ def plot_coverage(
     plt.plot(
     plt.plot(
         percentiles,
         percentiles,
         ensemble_results,
         ensemble_results,
-        'ob',
-        label='Ensemble',
+        "ob",
+        label="Ensemble",
     )
     )
     plt.plot(
     plt.plot(
         percentiles,
         percentiles,
         individual_results,
         individual_results,
-        'xr',
-        label='Individual (on entire dataset)',
+        "xr",
+        label="Individual (on entire dataset)",
     )
     )
     plt.xlabel(x_lablel)
     plt.xlabel(x_lablel)
     plt.ylabel(y_label)
     plt.ylabel(y_label)
@@ -50,17 +50,17 @@ def plot_coverage(
 RUN = False
 RUN = False
 
 
 # CONFIGURATION
 # CONFIGURATION
-if os.getenv('ADL_CONFIG_PATH') is None:
-    with open('config.toml', 'rb') as f:
+if os.getenv("ADL_CONFIG_PATH") is None:
+    with open("config.toml", "rb") as f:
         config = toml.load(f)
         config = toml.load(f)
 else:
 else:
-    with open(os.getenv('ADL_CONFIG_PATH'), 'rb') as f:
+    with open(os.getenv("ADL_CONFIG_PATH"), "rb") as f:
         config = toml.load(f)
         config = toml.load(f)
 
 
 
 
 ENSEMBLE_PATH = f"{config['paths']['model_output']}{config['ensemble']['name']}"
 ENSEMBLE_PATH = f"{config['paths']['model_output']}{config['ensemble']['name']}"
 
 
-V2_PATH = ENSEMBLE_PATH + '/v2'
+V2_PATH = ENSEMBLE_PATH + "/v2"
 
 
 
 
 # Result is a 1x2 tensor, with the softmax of the 2 predicted classes
 # Result is a 1x2 tensor, with the softmax of the 2 predicted classes
@@ -75,14 +75,14 @@ def output_to_confidence(result):
 # This function conducts tests on the models and returns the results, as well as saving the predictions and metrics
 # This function conducts tests on the models and returns the results, as well as saving the predictions and metrics
 def get_predictions(config):
 def get_predictions(config):
     models, model_descs = ens.load_models(
     models, model_descs = ens.load_models(
-        f'{ENSEMBLE_PATH}/models/',
-        config['training']['device'],
+        f"{ENSEMBLE_PATH}/models/",
+        config["training"]["device"],
     )
     )
-    models = [model.to(config['training']['device']) for model in models]
-    test_set = torch.load(f'{ENSEMBLE_PATH}/test_dataset.pt') + torch.load(
-        f'{ENSEMBLE_PATH}/val_dataset.pt'
+    models = [model.to(config["training"]["device"]) for model in models]
+    test_set = torch.load(f"{ENSEMBLE_PATH}/test_dataset.pt") + torch.load(
+        f"{ENSEMBLE_PATH}/val_dataset.pt"
     )
     )
-    print(f'Loaded {len(test_set)} samples')
+    print(f"Loaded {len(test_set)} samples")
 
 
     # [([model results], labels)]
     # [([model results], labels)]
     results = []
     results = []
@@ -93,12 +93,12 @@ def get_predictions(config):
     for _, (data, target) in tqdm(
     for _, (data, target) in tqdm(
         enumerate(test_set),
         enumerate(test_set),
         total=len(test_set),
         total=len(test_set),
-        desc='Getting predictions',
-        unit='sample',
+        desc="Getting predictions",
+        unit="sample",
     ):
     ):
         mri, xls = data
         mri, xls = data
-        mri = mri.unsqueeze(0).to(config['training']['device'])
-        xls = xls.unsqueeze(0).to(config['training']['device'])
+        mri = mri.unsqueeze(0).to(config["training"]["device"])
+        xls = xls.unsqueeze(0).to(config["training"]["device"])
         data = (mri, xls)
         data = (mri, xls)
         res = []
         res = []
         for j, model in enumerate(models):
         for j, model in enumerate(models):
@@ -164,125 +164,125 @@ if RUN:
     # Convert to pandas dataframes
     # Convert to pandas dataframes
     confs_df = pd.DataFrame(
     confs_df = pd.DataFrame(
         confs,
         confs,
-        columns=['predicted_class', 'confidence', 'true_label', 'class_1', 'class_2'],
+        columns=["predicted_class", "confidence", "true_label", "class_1", "class_2"],
     )
     )
     stdevs_df = pd.DataFrame(
     stdevs_df = pd.DataFrame(
-        stdevs, columns=['predicted_class', 'stdev', 'true_label', 'class_1', 'class_2']
+        stdevs, columns=["predicted_class", "stdev", "true_label", "class_1", "class_2"]
     )
     )
 
 
     entropies_df = pd.DataFrame(
     entropies_df = pd.DataFrame(
         entropies,
         entropies,
-        columns=['predicted_class', 'entropy', 'true_label', 'class_1', 'class_2'],
+        columns=["predicted_class", "entropy", "true_label", "class_1", "class_2"],
     )
     )
 
 
-    indv_df = pd.DataFrame(indv_results, columns=['class_1', 'class_2', 'true_label'])
+    indv_df = pd.DataFrame(indv_results, columns=["class_1", "class_2", "true_label"])
 
 
     if not os.path.exists(V2_PATH):
     if not os.path.exists(V2_PATH):
         os.makedirs(V2_PATH)
         os.makedirs(V2_PATH)
 
 
-    confs_df.to_csv(f'{V2_PATH}/ensemble_confidences.csv')
-    stdevs_df.to_csv(f'{V2_PATH}/ensemble_stdevs.csv')
-    entropies_df.to_csv(f'{V2_PATH}/ensemble_entropies.csv')
-    indv_df.to_csv(f'{V2_PATH}/individual_results.csv')
+    confs_df.to_csv(f"{V2_PATH}/ensemble_confidences.csv")
+    stdevs_df.to_csv(f"{V2_PATH}/ensemble_stdevs.csv")
+    entropies_df.to_csv(f"{V2_PATH}/ensemble_entropies.csv")
+    indv_df.to_csv(f"{V2_PATH}/individual_results.csv")
 else:
 else:
-    confs_df = pd.read_csv(f'{V2_PATH}/ensemble_confidences.csv')
-    stdevs_df = pd.read_csv(f'{V2_PATH}/ensemble_stdevs.csv')
-    entropies_df = pd.read_csv(f'{V2_PATH}/ensemble_entropies.csv')
-    indv_df = pd.read_csv(f'{V2_PATH}/individual_results.csv')
+    confs_df = pd.read_csv(f"{V2_PATH}/ensemble_confidences.csv")
+    stdevs_df = pd.read_csv(f"{V2_PATH}/ensemble_stdevs.csv")
+    entropies_df = pd.read_csv(f"{V2_PATH}/ensemble_entropies.csv")
+    indv_df = pd.read_csv(f"{V2_PATH}/individual_results.csv")
 
 
 
 
 # Plot confidence vs standard deviation, and change color of dots based on if they are correct
 # Plot confidence vs standard deviation, and change color of dots based on if they are correct
-correct_conf = confs_df[confs_df['predicted_class'] == confs_df['true_label']]
-incorrect_conf = confs_df[confs_df['predicted_class'] != confs_df['true_label']]
+correct_conf = confs_df[confs_df["predicted_class"] == confs_df["true_label"]]
+incorrect_conf = confs_df[confs_df["predicted_class"] != confs_df["true_label"]]
 
 
-correct_stdev = stdevs_df[stdevs_df['predicted_class'] == stdevs_df['true_label']]
-incorrect_stdev = stdevs_df[stdevs_df['predicted_class'] != stdevs_df['true_label']]
+correct_stdev = stdevs_df[stdevs_df["predicted_class"] == stdevs_df["true_label"]]
+incorrect_stdev = stdevs_df[stdevs_df["predicted_class"] != stdevs_df["true_label"]]
 
 
 correct_ent = entropies_df[
 correct_ent = entropies_df[
-    entropies_df['predicted_class'] == entropies_df['true_label']
+    entropies_df["predicted_class"] == entropies_df["true_label"]
 ]
 ]
 incorrect_ent = entropies_df[
 incorrect_ent = entropies_df[
-    entropies_df['predicted_class'] != entropies_df['true_label']
+    entropies_df["predicted_class"] != entropies_df["true_label"]
 ]
 ]
 
 
 plot, ax = plt.subplots()
 plot, ax = plt.subplots()
 plt.scatter(
 plt.scatter(
-    correct_conf['confidence'],
-    correct_stdev['stdev'],
-    color='green',
-    label='Correct Prediction',
+    correct_conf["confidence"],
+    correct_stdev["stdev"],
+    color="green",
+    label="Correct Prediction",
 )
 )
 plt.scatter(
 plt.scatter(
-    incorrect_conf['confidence'],
-    incorrect_stdev['stdev'],
-    color='red',
-    label='Incorrect Prediction',
+    incorrect_conf["confidence"],
+    incorrect_stdev["stdev"],
+    color="red",
+    label="Incorrect Prediction",
 )
 )
-plt.xlabel('Confidence (Raw Value)')
-plt.ylabel('Standard Deviation (Raw Value)')
-plt.title('Confidence vs Standard Deviation')
+plt.xlabel("Confidence (Raw Value)")
+plt.ylabel("Standard Deviation (Raw Value)")
+plt.title("Confidence vs Standard Deviation")
 plt.legend()
 plt.legend()
-plt.savefig(f'{V2_PATH}/confidence_vs_stdev.png')
+plt.savefig(f"{V2_PATH}/confidence_vs_stdev.png")
 
 
 plt.close()
 plt.close()
 
 
 # Do the same for confidence vs entropy
 # Do the same for confidence vs entropy
 plot, ax = plt.subplots()
 plot, ax = plt.subplots()
 plt.scatter(
 plt.scatter(
-    correct_conf['confidence'],
-    correct_ent['entropy'],
-    color='green',
-    label='Correct Prediction',
+    correct_conf["confidence"],
+    correct_ent["entropy"],
+    color="green",
+    label="Correct Prediction",
 )
 )
 plt.scatter(
 plt.scatter(
-    incorrect_conf['confidence'],
-    incorrect_ent['entropy'],
-    color='red',
-    label='Incorrect Prediction',
+    incorrect_conf["confidence"],
+    incorrect_ent["entropy"],
+    color="red",
+    label="Incorrect Prediction",
 )
 )
-plt.xlabel('Confidence (Raw Value)')
-plt.ylabel('Entropy (Raw Value)')
-plt.title('Confidence vs Entropy')
+plt.xlabel("Confidence (Raw Value)")
+plt.ylabel("Entropy (Raw Value)")
+plt.title("Confidence vs Entropy")
 plt.legend()
 plt.legend()
-plt.savefig(f'{V2_PATH}/confidence_vs_entropy.png')
+plt.savefig(f"{V2_PATH}/confidence_vs_entropy.png")
 
 
 plt.close()
 plt.close()
 
 
 
 
 # Calculate individual model accuracy and entropy
 # Calculate individual model accuracy and entropy
 # Determine predicted class
 # Determine predicted class
-indv_df['predicted_class'] = indv_df[['class_1', 'class_2']].idxmax(axis=1)
-indv_df['predicted_class'] = indv_df['predicted_class'].apply(
-    lambda x: 0 if x == 'class_1' else 1
+indv_df["predicted_class"] = indv_df[["class_1", "class_2"]].idxmax(axis=1)
+indv_df["predicted_class"] = indv_df["predicted_class"].apply(
+    lambda x: 0 if x == "class_1" else 1
 )
 )
-indv_df['correct'] = indv_df['predicted_class'] == indv_df['true_label']
-accuracy_indv = indv_df['correct'].mean()
+indv_df["correct"] = indv_df["predicted_class"] == indv_df["true_label"]
+accuracy_indv = indv_df["correct"].mean()
 f1_indv = met.F1(
 f1_indv = met.F1(
-    indv_df['predicted_class'].to_numpy(), indv_df['true_label'].to_numpy()
+    indv_df["predicted_class"].to_numpy(), indv_df["true_label"].to_numpy()
 )
 )
 auc_indv = metrics.roc_auc_score(
 auc_indv = metrics.roc_auc_score(
-    indv_df['true_label'].to_numpy(), indv_df['class_2'].to_numpy()
+    indv_df["true_label"].to_numpy(), indv_df["class_2"].to_numpy()
 )
 )
-indv_df['entropy'] = -1 * indv_df[['class_1', 'class_2']].apply(
+indv_df["entropy"] = -1 * indv_df[["class_1", "class_2"]].apply(
     lambda x: x * np.log(x), axis=0
     lambda x: x * np.log(x), axis=0
 ).sum(axis=1)
 ).sum(axis=1)
 
 
 # Calculate percentiles for confidence and standard deviation
 # Calculate percentiles for confidence and standard deviation
-quantiles_conf = confs_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
-    'confidence'
+quantiles_conf = confs_df.quantile(np.linspace(0, 1, 11), interpolation="lower")[
+    "confidence"
 ]
 ]
-quantiles_stdev = stdevs_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
-    'stdev'
+quantiles_stdev = stdevs_df.quantile(np.linspace(0, 1, 11), interpolation="lower")[
+    "stdev"
 ]
 ]
 
 
 # Additionally for individual confidence
 # Additionally for individual confidence
-quantiles_indv_conf = indv_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
-    'class_2'
+quantiles_indv_conf = indv_df.quantile(np.linspace(0, 1, 11), interpolation="lower")[
+    "class_2"
 ]
 ]
 
 
 # For individual entropy
 # For individual entropy
-quantiles_indv_entropy = indv_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
-    'entropy'
+quantiles_indv_entropy = indv_df.quantile(np.linspace(0, 1, 11), interpolation="lower")[
+    "entropy"
 ]
 ]
 
 
 #
 #
@@ -293,13 +293,13 @@ iter_conf = it.islice(quantiles_conf.items(), 0, None)
 for quantile in iter_conf:
 for quantile in iter_conf:
     percentile = quantile[0]
     percentile = quantile[0]
 
 
-    filt = confs_df[confs_df['confidence'] >= quantile[1]]
+    filt = confs_df[confs_df["confidence"] >= quantile[1]]
     accuracy = (
     accuracy = (
-        filt[filt['predicted_class'] == filt['true_label']].shape[0] / filt.shape[0]
+        filt[filt["predicted_class"] == filt["true_label"]].shape[0] / filt.shape[0]
     )
     )
-    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+    f1 = met.F1(filt["predicted_class"].to_numpy(), filt["true_label"].to_numpy())
 
 
-    accuracies_conf.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+    accuracies_conf.append({"percentile": percentile, "accuracy": accuracy, "f1": f1})
 
 
 accuracies_df = pd.DataFrame(accuracies_conf)
 accuracies_df = pd.DataFrame(accuracies_conf)
 
 
@@ -309,11 +309,11 @@ iter_conf = it.islice(quantiles_indv_conf.items(), 0, None)
 for quantile in iter_conf:
 for quantile in iter_conf:
     percentile = quantile[0]
     percentile = quantile[0]
 
 
-    filt = indv_df[indv_df['class_2'] >= quantile[1]]
-    accuracy = filt['correct'].mean()
-    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+    filt = indv_df[indv_df["class_2"] >= quantile[1]]
+    accuracy = filt["correct"].mean()
+    f1 = met.F1(filt["predicted_class"].to_numpy(), filt["true_label"].to_numpy())
 
 
-    indv_conf.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+    indv_conf.append({"percentile": percentile, "accuracy": accuracy, "f1": f1})
 
 
 indv_conf_df = pd.DataFrame(indv_conf)
 indv_conf_df = pd.DataFrame(indv_conf)
 
 
@@ -323,35 +323,35 @@ iter_entropy = it.islice(quantiles_indv_entropy.items(), 0, None)
 for quantile in iter_entropy:
 for quantile in iter_entropy:
     percentile = quantile[0]
     percentile = quantile[0]
 
 
-    filt = indv_df[indv_df['entropy'] <= quantile[1]]
-    accuracy = filt['correct'].mean()
-    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+    filt = indv_df[indv_df["entropy"] <= quantile[1]]
+    accuracy = filt["correct"].mean()
+    f1 = met.F1(filt["predicted_class"].to_numpy(), filt["true_label"].to_numpy())
 
 
-    indv_entropy.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+    indv_entropy.append({"percentile": percentile, "accuracy": accuracy, "f1": f1})
 
 
 indv_entropy_df = pd.DataFrame(indv_entropy)
 indv_entropy_df = pd.DataFrame(indv_entropy)
 
 
 
 
 # Plot the coverage for confidence and accuracy
 # Plot the coverage for confidence and accuracy
 plot_coverage(
 plot_coverage(
-    accuracies_df['percentile'],
-    accuracies_df['accuracy'],
-    indv_conf_df['accuracy'],
-    'Confidence Accuracy Coverage Plot',
-    'Minimum Confidence Percentile (Low to High)',
-    'Accuracy',
-    f'{V2_PATH}/coverage_conf.png',
+    accuracies_df["percentile"],
+    accuracies_df["accuracy"],
+    indv_conf_df["accuracy"],
+    "Confidence Accuracy Coverage Plot",
+    "Minimum Confidence Percentile (Low to High)",
+    "Accuracy",
+    f"{V2_PATH}/coverage_conf.png",
 )
 )
 
 
 # Plot the coverage for confidence and F1
 # Plot the coverage for confidence and F1
 plot_coverage(
 plot_coverage(
-    accuracies_df['percentile'],
-    accuracies_df['f1'],
-    indv_conf_df['f1'],
-    'Confidence F1 Coverage Plot',
-    'Minimum Confidence Percentile (Low to High)',
-    'F1',
-    f'{V2_PATH}/f1_coverage_conf.png',
+    accuracies_df["percentile"],
+    accuracies_df["f1"],
+    indv_conf_df["f1"],
+    "Confidence F1 Coverage Plot",
+    "Minimum Confidence Percentile (Low to High)",
+    "F1",
+    f"{V2_PATH}/f1_coverage_conf.png",
 )
 )
 
 
 # Repeat for standard deviation
 # Repeat for standard deviation
@@ -360,90 +360,90 @@ iter_stdev = it.islice(quantiles_stdev.items(), 0, None)
 for quantile in iter_stdev:
 for quantile in iter_stdev:
     percentile = quantile[0]
     percentile = quantile[0]
 
 
-    filt = stdevs_df[stdevs_df['stdev'] <= quantile[1]]
+    filt = stdevs_df[stdevs_df["stdev"] <= quantile[1]]
     accuracy = (
     accuracy = (
-        filt[filt['predicted_class'] == filt['true_label']].shape[0] / filt.shape[0]
+        filt[filt["predicted_class"] == filt["true_label"]].shape[0] / filt.shape[0]
     )
     )
-    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+    f1 = met.F1(filt["predicted_class"].to_numpy(), filt["true_label"].to_numpy())
 
 
-    accuracies_stdev.append({'percentile': percentile, 'accuracy': accuracy, 'f1': f1})
+    accuracies_stdev.append({"percentile": percentile, "accuracy": accuracy, "f1": f1})
 
 
 accuracies_stdev_df = pd.DataFrame(accuracies_stdev)
 accuracies_stdev_df = pd.DataFrame(accuracies_stdev)
 
 
 fig, ax = plt.subplots()
 fig, ax = plt.subplots()
 plt.plot(
 plt.plot(
-    accuracies_stdev_df['percentile'],
-    accuracies_stdev_df['accuracy'],
-    'ob',
-    label='Ensemble',
+    accuracies_stdev_df["percentile"],
+    accuracies_stdev_df["accuracy"],
+    "ob",
+    label="Ensemble",
 )
 )
 plt.plot(
 plt.plot(
-    accuracies_stdev_df['percentile'],
-    [accuracy_indv] * len(accuracies_stdev_df['percentile']),
-    'xr',
-    label='Individual (on entire dataset)',
+    accuracies_stdev_df["percentile"],
+    [accuracy_indv] * len(accuracies_stdev_df["percentile"]),
+    "xr",
+    label="Individual (on entire dataset)",
 )
 )
-plt.xlabel('Maximum Standard Deviation Percentile (High to Low)')
-plt.ylabel('Accuracy')
-plt.title('Standard Deviation Accuracy Coverage Plot')
+plt.xlabel("Maximum Standard Deviation Percentile (High to Low)")
+plt.ylabel("Accuracy")
+plt.title("Standard Deviation Accuracy Coverage Plot")
 plt.legend()
 plt.legend()
 plt.gca().invert_xaxis()
 plt.gca().invert_xaxis()
 ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
 ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
-plt.savefig(f'{V2_PATH}/coverage_stdev.png')
+plt.savefig(f"{V2_PATH}/coverage_stdev.png")
 plt.close()
 plt.close()
 
 
 # Plot coverage vs F1 for standard deviation
 # Plot coverage vs F1 for standard deviation
 fig, ax = plt.subplots()
 fig, ax = plt.subplots()
 plt.plot(
 plt.plot(
-    accuracies_stdev_df['percentile'], accuracies_stdev_df['f1'], 'ob', label='Ensemble'
+    accuracies_stdev_df["percentile"], accuracies_stdev_df["f1"], "ob", label="Ensemble"
 )
 )
 plt.plot(
 plt.plot(
-    accuracies_stdev_df['percentile'],
-    [f1_indv] * len(accuracies_stdev_df['percentile']),
-    'xr',
-    label='Individual (on entire dataset)',
+    accuracies_stdev_df["percentile"],
+    [f1_indv] * len(accuracies_stdev_df["percentile"]),
+    "xr",
+    label="Individual (on entire dataset)",
 )
 )
-plt.xlabel('Maximum Standard Deviation Percentile (High to Low)')
-plt.ylabel('F1')
-plt.title('Standard Deviation F1 Coverage Plot')
+plt.xlabel("Maximum Standard Deviation Percentile (High to Low)")
+plt.ylabel("F1")
+plt.title("Standard Deviation F1 Coverage Plot")
 plt.legend()
 plt.legend()
 plt.gca().invert_xaxis()
 plt.gca().invert_xaxis()
 ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
 ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0))
-plt.savefig(f'{V2_PATH}/coverage_f1_stdev.png')
+plt.savefig(f"{V2_PATH}/coverage_f1_stdev.png")
 
 
 plt.close()
 plt.close()
 
 
 
 
 # Print overall accuracy
 # Print overall accuracy
 overall_accuracy = (
 overall_accuracy = (
-    confs_df[confs_df['predicted_class'] == confs_df['true_label']].shape[0]
+    confs_df[confs_df["predicted_class"] == confs_df["true_label"]].shape[0]
     / confs_df.shape[0]
     / confs_df.shape[0]
 )
 )
 overall_f1 = met.F1(
 overall_f1 = met.F1(
-    confs_df['predicted_class'].to_numpy(), confs_df['true_label'].to_numpy()
+    confs_df["predicted_class"].to_numpy(), confs_df["true_label"].to_numpy()
 )
 )
 # Calculate ECE and MCE
 # Calculate ECE and MCE
 conf_ece = met.ECE(
 conf_ece = met.ECE(
-    confs_df['predicted_class'].to_numpy(),
-    confs_df['confidence'].to_numpy(),
-    confs_df['true_label'].to_numpy(),
+    confs_df["predicted_class"].to_numpy(),
+    confs_df["confidence"].to_numpy(),
+    confs_df["true_label"].to_numpy(),
 )
 )
 
 
 stdev_ece = met.ECE(
 stdev_ece = met.ECE(
-    stdevs_df['predicted_class'].to_numpy(),
-    stdevs_df['stdev'].to_numpy(),
-    stdevs_df['true_label'].to_numpy(),
+    stdevs_df["predicted_class"].to_numpy(),
+    stdevs_df["stdev"].to_numpy(),
+    stdevs_df["true_label"].to_numpy(),
 )
 )
 
 
 
 
-print(f'Overall accuracy: {overall_accuracy}, Overall F1: {overall_f1},')
-print(f'Confidence ECE: {conf_ece}')
-print(f'Standard Deviation ECE: {stdev_ece}')
+print(f"Overall accuracy: {overall_accuracy}, Overall F1: {overall_f1},")
+print(f"Confidence ECE: {conf_ece}")
+print(f"Standard Deviation ECE: {stdev_ece}")
 
 
 
 
 # Repeat for entropy
 # Repeat for entropy
-quantiles_entropy = entropies_df.quantile(np.linspace(0, 1, 11), interpolation='lower')[
-    'entropy'
+quantiles_entropy = entropies_df.quantile(np.linspace(0, 1, 11), interpolation="lower")[
+    "entropy"
 ]
 ]
 
 
 accuracies_entropy = []
 accuracies_entropy = []
@@ -451,14 +451,14 @@ iter_entropy = it.islice(quantiles_entropy.items(), 0, None)
 for quantile in iter_entropy:
 for quantile in iter_entropy:
     percentile = quantile[0]
     percentile = quantile[0]
 
 
-    filt = entropies_df[entropies_df['entropy'] <= quantile[1]]
+    filt = entropies_df[entropies_df["entropy"] <= quantile[1]]
     accuracy = (
     accuracy = (
-        filt[filt['predicted_class'] == filt['true_label']].shape[0] / filt.shape[0]
+        filt[filt["predicted_class"] == filt["true_label"]].shape[0] / filt.shape[0]
     )
     )
-    f1 = met.F1(filt['predicted_class'].to_numpy(), filt['true_label'].to_numpy())
+    f1 = met.F1(filt["predicted_class"].to_numpy(), filt["true_label"].to_numpy())
 
 
     accuracies_entropy.append(
     accuracies_entropy.append(
-        {'percentile': percentile, 'accuracy': accuracy, 'f1': f1}
+        {"percentile": percentile, "accuracy": accuracy, "f1": f1}
     )
     )
 
 
 accuracies_entropy_df = pd.DataFrame(accuracies_entropy)
 accuracies_entropy_df = pd.DataFrame(accuracies_entropy)
@@ -466,23 +466,23 @@ accuracies_entropy_df = pd.DataFrame(accuracies_entropy)
 
 
 # Plot the coverage for entropy and accuracy
 # Plot the coverage for entropy and accuracy
 plot_coverage(
 plot_coverage(
-    accuracies_entropy_df['percentile'],
-    accuracies_entropy_df['accuracy'],
-    indv_entropy_df['accuracy'],
-    'Entropy Accuracy Coverage Plot',
-    'Minimum Entropy Percentile (Low to High)',
-    'Accuracy',
-    f'{V2_PATH}/coverage_entropy.png',
+    accuracies_entropy_df["percentile"],
+    accuracies_entropy_df["accuracy"],
+    indv_entropy_df["accuracy"],
+    "Entropy Accuracy Coverage Plot",
+    "Minimum Entropy Percentile (Low to High)",
+    "Accuracy",
+    f"{V2_PATH}/coverage_entropy.png",
 )
 )
 
 
 # Plot the coverage for entropy and F1
 # Plot the coverage for entropy and F1
 plot_coverage(
 plot_coverage(
-    accuracies_entropy_df['percentile'],
-    accuracies_entropy_df['f1'],
-    indv_entropy_df['f1'],
-    'Entropy F1 Coverage Plot',
-    'Maximum Entropy Percentile (High to Low)',
-    'F1',
-    f'{V2_PATH}/f1_coverage_entropy.png',
+    accuracies_entropy_df["percentile"],
+    accuracies_entropy_df["f1"],
+    indv_entropy_df["f1"],
+    "Entropy F1 Coverage Plot",
+    "Maximum Entropy Percentile (High to Low)",
+    "F1",
+    f"{V2_PATH}/f1_coverage_entropy.png",
     flip=True,
     flip=True,
 )
 )

+ 20 - 8
utils/data/datasets.py

@@ -18,19 +18,31 @@ Prepares CustomDatasets for training, validating, and testing CNN
 
 
 def prepare_datasets(mri_dir, xls_file, val_split=0.2, seed=50, device=None):
 def prepare_datasets(mri_dir, xls_file, val_split=0.2, seed=50, device=None):
     if device is None:
     if device is None:
-        device = torch.device('cpu')
+        device = torch.device("cpu")
 
 
     rndm = random.Random(seed)
     rndm = random.Random(seed)
-    xls_data = pd.read_csv(xls_file).set_index('Image Data ID')
-    raw_data = glob.glob(mri_dir + '*')
+    xls_data = pd.read_csv(xls_file)
+
+    # Strip leading and trailing spaces from every string value in the dataframe
+    xls_data = xls_data.replace(r"^ +| +$", r"", regex=True)
+
+    # Strip leading and trailing whitespace from column names
+    xls_data.columns = xls_data.columns.str.strip()
+
+    xls_data = xls_data.set_index("Image Data ID")
+
+    raw_data = glob.glob(mri_dir + "*")
+
+    print(f"Found {len(raw_data)} images in {mri_dir}")
+
     AD_list = []
     AD_list = []
     NL_list = []
     NL_list = []
 
 
     # TODO Check that image is in CSV?
     # TODO Check that image is in CSV?
     for image in raw_data:
     for image in raw_data:
-        if 'NL' in image:
+        if "NL" in image:
             NL_list.append(image)
             NL_list.append(image)
-        elif 'AD' in image:
+        elif "AD" in image:
             AD_list.append(image)
             AD_list.append(image)
 
 
     rndm.shuffle(AD_list)
     rndm.shuffle(AD_list)
@@ -90,7 +102,7 @@ def get_train_val_test(AD_list, NL_list, val_split):
 
 
 
 
 class ADNIDataset(Dataset):
 class ADNIDataset(Dataset):
-    def __init__(self, mri, xls: pd.DataFrame, device=torch.device('cpu')):
+    def __init__(self, mri, xls: pd.DataFrame, device=torch.device("cpu")):
         self.mri_data = mri  # DATA IS A LIST WITH TUPLES (image_dir, class_id)
         self.mri_data = mri  # DATA IS A LIST WITH TUPLES (image_dir, class_id)
         self.xls_data = xls
         self.xls_data = xls
         self.device = device
         self.device = device
@@ -102,9 +114,9 @@ class ADNIDataset(Dataset):
         # Get used data
         # Get used data
 
 
         # data = xls_data.loc[['Sex', 'Age (current)', 'PTID', 'DXCONFID (1=uncertain, 2= mild, 3= moderate, 4=high confidence)', 'Alz_csf']]
         # data = xls_data.loc[['Sex', 'Age (current)', 'PTID', 'DXCONFID (1=uncertain, 2= mild, 3= moderate, 4=high confidence)', 'Alz_csf']]
-        data = xls_data.loc[['Sex', 'Age (current)']]
+        data = xls_data.loc[["Sex", "Age (current)"]]
 
 
-        data.replace({'M': 0, 'F': 1}, inplace=True)
+        data.replace({"M": 0, "F": 1}, inplace=True)
 
 
         # Convert to tensor
         # Convert to tensor
         xls_tensor = torch.tensor(data.values.astype(float))
         xls_tensor = torch.tensor(data.values.astype(float))

Some files were not shown because too many files changed in this diff