1 year ago · db2f133675
--- a/config.toml
+++ b/config.toml
@@ -0,0 +1,27 @@
 
				+# Configuration file for the Alzheimer's DL model
			
 
				+
			
 
				+[paths]
			
 
				+mri_data = '/data/data_wnx1/_Data/AlzheimersDL/CNN+RNN-2class-1cnn+data/PET_volumes_customtemplate_float32/'
			
 
				+xls_data = '/export/home/nschense/alzheimers/Pytorch_CNN-RNN/LP_ADNIMERGE.csv'
			
 
				+
			
 
				+# CHANGE THE OUTPUT PATHS BELOW BEFORE RUNNING
			
 
				+model_output = '/export/home/nschense/alzheimers/Pytorch_CNN-RNN/saved_models/'
			
 
				+plot_output = '/export/home/nschense/alzheimers/Pytorch_CNN-RNN/plots/'
			
 
				+training_record_output = '/export/home/nschense/alzheimers/Pytorch_CNN-RNN/training_records/'
			
 
				+testing_record_output = '/export/home/nschense/alzheimers/Pytorch_CNN-RNN/testing_records/'
			
 
				+
			
 
				+[cuda]
			
 
				+device = 1
			
 
				+
			
 
				+[training]
			
 
				+batch_size = 64
			
 
				+epochs = 10
			
 
				+learning_rate = 0.0001
			
 
				+runs = 1
			
 
				+
			
 
				+[dataset]
			
 
				+validation_split = 0.3
			
 
				+
			
 
				+[model]
			
 
				+name = 'alzheimers+cnn'
			
 
				+droprate = 0.5
			
--- a/main.py
+++ b/main.py
@@ -1,64 +1,80 @@
 
				+print("--- INITIALIZING LIBRARIES ---")
			
 
				 import torch
			
 
				 from torch import nn
			
 
				 
			
 
				-
			
 
				 # GENERAL PURPOSE
			
 
				 import numpy as np
			
 
				 from datetime import datetime
			
 
				 import pandas as pd
			
 
				 import os
			
 
				+import tomli as tl
			
 
				 
			
 
				 # FOR TRAINING
			
 
				 import torch.optim as optim
			
 
				 import utils.models as models
			
 
				 from utils.training import train_model, test_model, initalize_dataloaders, plot_results
			
 
				+print("--- LIBRARIES INITIALIZED ---")
			
 
				 
			
 
				+#GET CONFIG SETTINGS
			
 
				+if os.getenv('ADL_CONFIG_PATH') is None:
			
 
				+    with open ('config.toml', 'rb') as f:
			
 
				+        config = tl.load(f)
			
 
				+else:
			
 
				+    with open(os.getenv('ADL_CONFIG_PATH'), 'rb') as f:
			
 
				+        config = tl.load(f)
			
 
				 
			
 
				-#Set Default GPU
			
 
				-cuda_device = torch.device('cuda:1')
			
 
				+cuda_device = torch.device(config['cuda']['device'])
			
 
				 torch.set_default_device(cuda_device)
			
 
				 
			
 
				+model_name = config['model']['name']
			
 
				+
			
 
				 print("--- RUNNING ---")
			
 
				 print("Pytorch Version: " + torch. __version__)
			
 
				 
			
 
				 # data & training properties:
			
 
				-val_split = 0.2     # % of val and test, rest will be train
			
 
				-runs = 1
			
 
				-epochs = 30
			
 
				+val_split = config['dataset']['validation_split']     # % of val and test, rest will be train
			
 
				+runs = config['training']['runs']
			
 
				+epochs = config['training']['epochs']
			
 
				 seeds = [np.random.randint(0, 1000) for _ in range(runs)]
			
 
				 
			
 
				-#Data Path
			
 
				-mri_path = '/data/data_wnx1/_Data/AlzheimersDL/CNN+RNN-2class-1cnn+data/PET_volumes_customtemplate_float32/'
			
 
				-
			
 
				-#Local Path
			
 
				-local_path = '/export/home/nschense/alzheimers/Pytorch_CNN-RNN'
			
 
				+#paths
			
 
				+mri_path = config['paths']['mri_data']
			
 
				+xls_path = config['paths']['xls_data']
			
 
				+saved_model_path = config['paths']['model_output']
			
 
				+plot_path = config['paths']['plot_output']
			
 
				+training_record_path = config['paths']['training_record_output']
			
 
				 
			
 
				-xls_path = local_path + '/LP_ADNIMERGE.csv'
			
 
				-saved_model_path = local_path + '/saved_models/'
			
 
				-plot_path = local_path + '/plots/'
			
 
				-training_record_path = local_path + '/training_records/'
			
 
				-
			
 
				-DEBUG = False
			
 
				-
			
 
				-model_CNN = models.CNN_Net(1, 2, 0.5).to(cuda_device)
			
 
				+#model
			
 
				+model_CNN = models.CNN_Net(1, 2, config['model']['droprate']).to(cuda_device)
			
 
				 criterion = nn.BCELoss()
			
 
				-optimizer = optim.Adam(model_CNN.parameters(), lr=0.001)
			
 
				+optimizer = optim.Adam(model_CNN.parameters(), lr=config['training']['learning_rate'])
			
 
				+
			
 
				 
			
 
				-        
			
 
				 for seed in seeds:
			
 
				+    #get time stamp for model
			
 
				     time_stamp = datetime.now().strftime('%Y%m%d+%H%M%S')
			
 
				 
			
 
				-    train_loader, val_loader, test_loader = initalize_dataloaders(mri_path, xls_path, val_split, seed, cuda_device=cuda_device)
			
 
				-    train_results = train_model(model_CNN, seed, time_stamp, epochs, train_loader, val_loader, saved_model_path, "CNN", optimizer, criterion, cuda_device=cuda_device)
			
 
				-    test_model(model_CNN, test_loader, cuda_device=cuda_device)
			
 
				+    #initialize dataloaders, train model, and test model
			
 
				+    train_loader, val_loader, test_loader = initalize_dataloaders(mri_path, xls_path, val_split, seed, cuda_device=cuda_device, batch_size=config['training']['batch_size'])
			
 
				+    
			
 
				+
			
 
				+    print("--- TRAINING MODEL ---")
			
 
				+    print("Seed: ", seed)
			
 
				+    
			
 
				+        
			
 
				+    train_results = train_model(model_CNN, seed, time_stamp, epochs, train_loader, val_loader, saved_model_path, model_name, optimizer, criterion, cuda_device=cuda_device)
			
 
				+    print("--- TESTING MODEL ---")
			
 
				+    predicted, actual, correct, incorrect = test_model(model_CNN, test_loader, cuda_device=cuda_device)
			
 
				+    
			
 
				+    print("Accuracy: " + str(correct / (correct + incorrect)))
			
 
				     
			
 
				-    #Plot results
			
 
				-    plot_results(train_results["train_acc"], train_results["train_loss"], train_results["val_acc"], train_results["val_loss"], "CNN", time_stamp, plot_path)
			
 
				+    #Plot results and confusion matrix
			
 
				+    plot_results(train_results["train_acc"], train_results["train_loss"], train_results["val_acc"], train_results["val_loss"], model_name, time_stamp, plot_path)
			
 
				     
			
 
				     #Save training results
			
 
				     if not os.path.exists(training_record_path):
			
 
				         os.makedirs(training_record_path)
			
 
				-    train_results.to_csv(training_record_path + "CNN_t-" + time_stamp + "_s-" + str(seed) + "_e-" + str(epochs) + ".csv")
			
 
				+    train_results.to_csv(training_record_path + model_name + "_t-" + time_stamp + "_s-" + str(seed) + "_e-" + str(epochs) + ".csv")
			
 
				     
			
 
				     
			
 
				     
			
--- a/test_models.py
+++ b/test_models.py
@@ -0,0 +1,61 @@
 
				+print("--- INITIALIZING LIBRARIES ---")
			
 
				+from utils.training import train_model, test_model, initalize_dataloaders, plot_confusion_matrix, plot_roc_curve
			
 
				+import tomli as tl
			
 
				+import torch
			
 
				+import os
			
 
				+from utils.models import CNN_Net
			
 
				+ 
			
 
				+print("--- LIBRARIES INITIALIZED ---")
			
 
				+
			
 
				+# GET CONFIG SETTINGS
				+# ADL_CONFIG_PATH overrides the default config.toml in the working directory.
				+config_path = os.getenv('ADL_CONFIG_PATH', 'config.toml')
				+with open(config_path, 'rb') as f:
				+    config = tl.load(f)
				+
				+cuda_device = torch.device(config['cuda']['device'])
				+
				+# Input/output locations taken from the [paths] section of the config.
				+plot_path = config['paths']['plot_output']
				+model_path = config['paths']['model_output']
				+test_output_path = config['paths']['testing_record_output']  # read but not used below
				+
				+# train_model writes whole pickled models as "*.pkl"; ignore anything else in
				+# the directory (e.g. older "*.pt" state dicts) and sort for a stable order.
				+model_files = sorted(name for name in os.listdir(model_path) if name.endswith('.pkl'))
				+
				+# For each saved model: recover its metadata from the file name, rebuild the
				+# test split with the same seed, load the model and evaluate it.
				+print("--- TESTING MODELS ---")
				+for model_file in model_files:
				+    # File names look like <model_name>_t-<timestamp>_s-<seed>_e-<epochs>.pkl.
				+    # Splitting from the right keeps underscores or "t-"/"s-" substrings that
				+    # may occur inside the model name from confusing the parse.
				+    fields = os.path.splitext(model_file)[0].rsplit("_", 3)
				+    well_formed = (
				+        len(fields) == 4
				+        and fields[1].startswith("t-")
				+        and fields[2].startswith("s-")
				+        and fields[2][2:].isdigit()
				+    )
				+    if not well_formed:
				+        print("  - Skipping unrecognized file: " + model_file)
				+        continue
				+
				+    model_name = fields[0]
				+    timestamp = fields[1][2:]
				+    seed = int(fields[2][2:])
				+
				+    print("  - Testing Model: " + timestamp + ", Seed: ", seed)
				+    print("    * Loading Dataset")
				+
				+    # Re-using the training seed is what reproduces the same test split.
				+    _, _, test_loader = initalize_dataloaders(config['paths']['mri_data'], config['paths']['xls_data'], config['dataset']['validation_split'], seed, cuda_device=torch.device('cpu'), batch_size=config['training']['batch_size'])
				+
				+    print("    * Loading Model")
				+    # SECURITY: torch.load unpickles arbitrary Python objects, so model_output
				+    # must only contain trusted files.
				+    # weights_only=False is needed to load a whole pickled module on PyTorch
				+    # releases where the default is True; map_location places the model on the
				+    # configured device rather than the one it was saved from.
				+    model = torch.load(os.path.join(model_path, model_file), map_location=cuda_device, weights_only=False)
				+    model.eval()  # inference mode, e.g. turns off dropout if the model uses it
				+
				+    print("    * Testing Model")
				+    predicted, actual, correct, incorrect = test_model(model, test_loader, cuda_device=cuda_device)
				+    print("    * Accuracy: " + str(correct / (correct + incorrect)))
				+
				+    plot_confusion_matrix(predicted, actual, model_name, timestamp, plot_path)
				+    plot_roc_curve(predicted, actual, model_name, timestamp, plot_path)
			
--- a/utils/preprocess.py
+++ b/utils/preprocess.py
@@ -21,10 +21,6 @@ def prepare_datasets(mri_dir, xls_file, val_split=0.2, seed=50):
 
				     AD_list = []
			
 
				     NL_list = []
			
 
				 
			
 
				-
			
 
				-    print("--- DATA INFO ---")
			
 
				-    print("Amount of images: " + str(len(raw_data)))
			
 
				-
			
 
				     # TODO Check that image is in CSV?
			
 
				     for image in raw_data:
			
 
				         if "NL" in image:
			
@@ -32,9 +28,6 @@ def prepare_datasets(mri_dir, xls_file, val_split=0.2, seed=50):
 
				         elif "AD" in image:
			
 
				             AD_list.append(image)
			
 
				 
			
 
				-    print("Total AD: " + str(len(AD_list)))
			
 
				-    print("Total NL: " + str(len(NL_list)))
			
 
				-
			
 
				     rndm.shuffle(AD_list)
			
 
				     rndm.shuffle(NL_list)
			
 
				 
			
--- a/utils/training.py
+++ b/utils/training.py
@@ -5,23 +5,17 @@ from utils.preprocess import prepare_datasets
 
				 from torch.utils.data import DataLoader
			
 
				 import pandas as pd
			
 
				 import matplotlib.pyplot as plt
			
 
				+from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, roc_auc_score, RocCurveDisplay
			
 
				+import numpy as np
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 def train_model(model, seed, timestamp, epochs, train_loader, val_loader, saved_model_path, model_name, optimizer, criterion, cuda_device=torch.device('cuda:0')):
			
 
				-    #Print Shape of Image Data
			
 
				-
			
 
				-    #Print Training Data Length
			
 
				-    print("Length of Training Data: ", len(train_loader))
			
 
				-
			
 
				-
			
 
				-    print("--- INITIALIZING MODEL ---")
			
 
				-    print("Seed: ", seed)
			
 
				+   
			
 
				     epoch_number = 0
			
 
				 
			
 
				-    print("--- TRAINING MODEL ---")
			
 
				     train_losses = []
			
 
				     train_accs = []
			
 
				     val_losses = []
			
@@ -34,7 +28,7 @@ def train_model(model, seed, timestamp, epochs, train_loader, val_loader, saved_
 
				         
			
 
				         #Training
			
 
				         train_length = len(train_loader)
			
 
				-        for _, data in tqdm(enumerate(train_loader, 0), total=train_length, desc="Epoch " + str(epoch), unit="batch"):
			
 
				+        for _, data in tqdm(enumerate(train_loader, 0), total=train_length, desc="Epoch " + str(epoch) + "/" + str(epochs), unit="batch"):
			
 
				             mri, xls, label = data
			
 
				 
			
 
				             optimizer.zero_grad()
			
@@ -96,7 +90,7 @@ def train_model(model, seed, timestamp, epochs, train_loader, val_loader, saved_
 
				     if not os.path.exists(saved_model_path):
			
 
				         os.makedirs(saved_model_path)
			
 
				     
			
 
				-    torch.save(model.state_dict(), saved_model_path + model_name + "_t-" + timestamp + "_s-" + str(seed) + "_e-" + str(epochs) + ".pt")
			
 
				+    torch.save(model, saved_model_path + model_name + "_t-" + timestamp + "_s-" + str(seed) + "_e-" + str(epochs) + ".pkl")
			
 
				     
			
 
				     #Create dataframe with training and validation losses and accuracies, set index to epoch
			
 
				     df = pd.DataFrame()
			
@@ -109,10 +103,12 @@ def train_model(model, seed, timestamp, epochs, train_loader, val_loader, saved_
 
				     return df
			
 
				     
			
 
				 def test_model(model, test_loader, cuda_device=torch.device('cuda:0')):
			
 
				-    print("--- TESTING MODEL ---")
			
 
				     #Test model
			
 
				     correct = 0
			
 
				     incorrect = 0
			
 
				+    
			
 
				+    predictions = []
			
 
				+    actual = []
			
 
				 
			
 
				     with torch.no_grad():
			
 
				         length = len(test_loader)
			
@@ -130,13 +126,16 @@ def test_model(model, test_loader, cuda_device=torch.device('cuda:0')):
 
				 
			
 
				             incorrect += (predicted != labels).sum().item()
			
 
				             correct += (predicted == labels).sum().item()
			
 
				+            
			
 
				+            
			
 
				+            predictions.extend(predicted.tolist())
			
 
				+            actual.extend(labels.tolist())
			
 
				+                
			
 
				+    return predictions, actual, correct, incorrect
			
 
				 
			
 
				-    print("Model Accuracy: ", 100 * correct / (correct + incorrect))
			
 
				-
			
 
				-def initalize_dataloaders(mri_path, xls_path, val_split, seed, cuda_device=torch.device('cuda:0')):
			
 
				+def initalize_dataloaders(mri_path, xls_path, val_split, seed, cuda_device=torch.device('cuda:0'), batch_size=64):
			
 
				     training_data, val_data, test_data = prepare_datasets(mri_path, xls_path, val_split, seed)
			
 
				 
			
 
				-    batch_size = 64
			
 
				     train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, generator=torch.Generator(device=cuda_device))
			
 
				     test_dataloader = DataLoader(test_data, batch_size=(batch_size // 4), shuffle=True, generator=torch.Generator(device=cuda_device))
			
 
				     val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, generator=torch.Generator(device=cuda_device))
			
@@ -158,6 +157,7 @@ def plot_results(train_acc, train_loss, val_acc, val_loss, model_name, timestamp
 
				     plt.title("Accuracy of " + model_name + " Model: " + timestamp)
			
 
				     plt.legend()
			
 
				     plt.savefig(plot_path + model_name + "_t-" + timestamp + "_acc.png")
			
 
				+    plt.close()
			
 
				     
			
 
				     #Loss Plot
			
 
				     plt.figure()
			
@@ -168,7 +168,30 @@ def plot_results(train_acc, train_loss, val_acc, val_loss, model_name, timestamp
 
				     plt.title("Loss of " + model_name + " Model: " + timestamp)
			
 
				     plt.legend()
			
 
				     plt.savefig(plot_path + model_name + "_t-" + timestamp + "_loss.png")
			
 
				+    plt.close()
			
 
				     
			
 
				+def plot_confusion_matrix(predicted, actual, model_name, timestamp, plot_path):
				+    """Save a confusion matrix of the test predictions as a PNG image.
				+
				+    Args:
				+        predicted: Predicted labels as returned by ``test_model``.
				+        actual: Ground-truth labels, in the same order as ``predicted``.
				+        model_name: Prefix of the output file name.
				+        timestamp: Run time stamp embedded in the output file name.
				+        plot_path: Output directory, created if missing. It is joined to the
				+            file name by plain concatenation, so it must end with a separator.
				+    """
				+    os.makedirs(plot_path, exist_ok=True)
				     
				+    # from_predictions takes (y_true, y_pred) and already draws the figure, so
				+    # no extra .plot() call is needed (it would open a second, leaked figure).
				+    display = ConfusionMatrixDisplay.from_predictions(actual, predicted)
				+    display.figure_.savefig(plot_path + model_name + "_t-" + timestamp + "_confusion_matrix.png")
				+    plt.close(display.figure_)
			
 
				     
			
 
				+def plot_roc_curve(predicted, actual, model_name, timestamp, plot_path):
				+    """Save a ROC curve, annotated with its AUC, as a PNG image.
				+
				+    Args:
				+        predicted: Predicted scores or labels as returned by ``test_model``.
				+        actual: Ground-truth labels, in the same order as ``predicted``.
				+        model_name: Prefix of the output file name.
				+        timestamp: Run time stamp embedded in the output file name.
				+        plot_path: Output directory, created if missing. It is joined to the
				+            file name by plain concatenation, so it must end with a separator.
				+    """
				+    os.makedirs(plot_path, exist_ok=True)
				+
				+    # Keep the converted arrays; the conversion results used to be discarded.
				+    predicted = np.array(predicted, dtype=np.float64)
				+    actual = np.array(actual, dtype=np.float64)
				+
				+    # NOTE(review): if `predicted` holds hard class labels rather than
				+    # probabilities, the curve has a single operating point - confirm what
				+    # test_model returns.
				+    fpr, tpr, _ = roc_curve(actual, predicted)
				+    auc = roc_auc_score(actual, predicted)
				+
				+    # RocCurveDisplay.plot() creates its own figure, so no plt.figure() is
				+    # opened beforehand (it would stay empty and never be closed).
				+    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc).plot()
				+    display.figure_.savefig(plot_path + model_name + "_t-" + timestamp + "_roc_curve.png")
				+    plt.close(display.figure_)