From 1b6092d0525d6bc1a02a5b3e7d97f3a1adae8f5c Mon Sep 17 00:00:00 2001
From: s47700 <s47700@bht-berlin.de>
Date: Tue, 26 Mar 2024 14:15:41 +0100
Subject: [PATCH] Upload New File

---
 train/TRAIN_MODEL_1_.py | 337 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 337 insertions(+)
 create mode 100644 train/TRAIN_MODEL_1_.py

diff --git a/train/TRAIN_MODEL_1_.py b/train/TRAIN_MODEL_1_.py
new file mode 100644
index 0000000..dbfd6ee
--- /dev/null
+++ b/train/TRAIN_MODEL_1_.py
@@ -0,0 +1,337 @@
+import torch
+import torchvision
+import torchvision.transforms as transforms
+import torch.nn as nn
+
+import os
+import sys
+import time
+
+import yaml
+import json
+
+"""
+Authors: Jenö Faist, Paul Judis
+References: LFI-3 cnn.py
+"""
+
+"""
+Template file to train a CNN.
+"""
+
+"""
+This file trains the convolutional neural network with the selected dataset
+and hyperparameters. The model is saved in the TRAIN_MODELS folder and can be
+trained further from there or moved to COMPLETE_MODELS to keep it.
+"""
+
+if __name__ == '__main__':
+
+    """
+    Define all data paths here; commonly the model and hyperparameters are
+    kept in TRAIN_MODELS.
+    """
+
+    absolutepath = os.path.dirname(__file__)
+
+    training_set_PATH = absolutepath+'/DATASETS/early_3D_Kegel_SET/train'
+    hyperparameters_PATH = absolutepath+'/TRAIN_MODELS/hyperparameters.yaml'
+
+    """
+    All save paths.
+    """
+    model_save_PATH = absolutepath+'/TRAIN_MODELS/3D_DEC_MODEL_.pt'
+    training_history_PATH = absolutepath+'/TRAIN_MODELS/3D_DEC_MODEL_TRAIN_HISTORY_.json'
+    training_save_PATH = absolutepath+'/TRAIN_MODELS/3D_DEC_MODEL_TRAIN_SAVE_.json'
+
+    """
+    These console prints let you verify that CUDA actually uses your GPU
+    and that the data paths are set correctly.
+    """
+
+    print("STARTING CNN TRAINING")
+    print("---------------------------")
+    print("Cuda Version: " + str(torch.version.cuda))
+    print("Cuda: "+str(torch.cuda.is_available()))
+    print("GPU: "+str(torch.cuda.get_device_name()))
+    print("Current Folderpath: "+absolutepath)
+    print("Model Saving in: "+model_save_PATH)
+    print("Training Save in: "+training_save_PATH)
+    print("Hyperparameters in: "+hyperparameters_PATH)
+    print("Training History Save in: "+training_history_PATH)
+    print("---------------------------")
+
+    """
+    Set up PyTorch and fix the random seed so that results can be reproduced.
+    """
+    torch.manual_seed(0)
+    # Set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    """
+    Load the hyperparameters from the YAML file if possible;
+    otherwise fall back to the defaults below.
+    """
+
+    print("Loading Hyperparameters")  # CONSOLE
+
+    data = {
+        'default':
+            {'batch_size': 64,
+             'num_epochs': 10,
+             'learning_rate': 0.001}
+    }
+    try:
+        with open(hyperparameters_PATH, "r") as stream:
+            data = yaml.safe_load(stream)
+        print("[!!!] Hyperparameters loaded successfully! [!!!]")
+    except (OSError, yaml.YAMLError):
+        print("[?!?] Hyperparameters couldn't be loaded! [?!?]")
+        print("[!!!] Using default hyperparameters! [!!!]")
+
+    # Set hyperparameters
+    num_epochs = data['default']['num_epochs']
+    batch_size = data['default']['batch_size']
+    learning_rate = data['default']['learning_rate']
+
+    print("---------------------------")  # CONSOLE
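+    # For reference -- a sketch of the hyperparameters.yaml this loader
+    # expects, inferred from the default dict above (values are examples):
+    #
+    #   default:
+    #     batch_size: 64
+    #     num_epochs: 10
+    #     learning_rate: 0.001
+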
+ """ + transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Resize((256,256)), + ] + ) + + # Load the Dataset with the transformations + train_dataset = torchvision.datasets.ImageFolder( + root= traing_set_PATH, + transform=transform + ) + + + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8) + + """ + In this Path you Define the Neural Network Model that you want be using. + For this File: its default resnet18 + """ + + model = torchvision.models.resnet18(weights='DEFAULT') # base model + num_ftrs = model.fc.in_features + model.fc = nn.Linear(num_ftrs, 2) + + + """ + This Loades the not fulled trained Model if training was canceled, + so that the Training of Model can be continued. + """ + + # Last training Epoch and Batch + last_eb = [0,0] + # A List to safe the loss over time for later plots + training_history = [[],[]] + + try: + model.load_state_dict(torch.load(model_save_PATH),strict=False) + model.eval() + + + with open(training_save_PATH, 'r') as f: + last_eb = json.load(f) + + with open(training_history_PATH, 'r') as f: + training_history = json.load(f) + + + + print("[!!!] Using previous Trained Model Starting from Epoch:"+str(last_eb[0])+" Batch:"+str(last_eb[1])+" [!!!]") + except: + print("[!!!] No last training model found, starting new training with base model [!!!]") + + + ### MODEL TORCH MODIFIERS ### + + # Problem give errors (TODO) + # Parallelize training across multiple GPUs + #model = torch.nn.DataParallel(model) + + # Set the model to run on the device + model = model.to(device) + + + """ + Here you are defining wich error function and optimizer you want be using + """ + criterion = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + + + + + ### CONSOLE PRINTS #### + print("---------------------") + print("Beginn Training") + print("Starting with " + "Epoch:"+str(last_eb[0])+" Batch:"+str(last_eb[1])) + print("---------------------") + ####################### + + # Time Variables for tracking + start_time = time.time() + current_time = time.time() + last_time = 0 + + + + """ + In here the Model gets Trainied using the specifications above. + The trainings loop is in a try so if a KeyboardInterrupt happens argo the Programm gets closed + the Model gets automaticly saved. + """ + try: + + + for epoch in range(last_eb[0],num_epochs): + """ + If we train a Model that gets continued in Training then this part insures that + the last trained batches get skipped. 
+ """ + + list_dt = [] + + count = last_eb[1] + train_loader_iter = iter(train_loader) + data_load_time_start = time.time() + data_load_last_time = 0 + list_data_time = [] + for n in range(last_eb[1]): + next(train_loader_iter) + data_load_time = time.time() - data_load_time_start - data_load_last_time + list_data_time.append(data_load_time) + + avg_dt = sum(list_data_time)/len(list_data_time) + + sys.stdout.write("\033[K") + est_time =((last_eb[1] -n)*avg_dt)/60**2 + print('Skiping last Trained Batches Estimated Left Time:' + '%.2f h ' % est_time + 'Batches to skip:'+str((last_eb[1] -n)), end='\r') + data_load_last_time = time.time() - data_load_time_start + + sys.stdout.write("\033[K") + + + print("Training Epoch: "+str(epoch)) # Console + + + + + for inputs, labels in train_loader: + + """ + Train the Model with this Batch + """ + # Move input and label tensors to the device + + inputs = inputs.to(device) + labels = labels.to(device) + + # Zero out the optimizer + optimizer.zero_grad() + + # Forward pass + outputs = model(inputs) + loss = criterion(outputs, labels) + + # Backward pass + loss.backward() + optimizer.step() + + + """ + The Real Training Ends here. + This Part is just for visuals so that you now how far the Model is trained + and how much time is left for this Epoch + """ + current_time = time.time() - start_time + dt = current_time - last_time + list_dt.append(dt) + + training_history[0].append(loss.item()) + + _, preds = torch.max(outputs, 1) + training_history[1].append(torch.sum(preds == labels.data).item()) + + avg_dt = sum(list_dt)/len(list_dt) + avg_loss = sum(training_history[0])/len(training_history[0]) + + + + Time_Estimate = (round((len(train_dataset)-count*batch_size)/batch_size)*avg_dt)/60**2 + sys.stdout.write("\033[K") + print('Training Model | Estimated Left Time for this Epoch: ' + "%.2f h" % Time_Estimate + "| Current Average Loss: " + "%.5f" % avg_loss + '| Batches Left:'+str(round((len(train_dataset)-count*batch_size)/batch_size)), end='\r') + last_time = current_time + + + + count += 1 + + """ + Every Epoch the Model gets Saved Plus the Training History containing all Loses and Accurarcys for every Batch + """ + sys.stdout.write("\033[K") + + + print(f'Epoch {epoch+1}/{num_epochs} Done, Loss: {sum(training_history[0])/len(training_history[0]):.4f}') + last_eb = [epoch+1,0] + with open(training_save_PATH, 'w') as f: + json.dump(last_eb, f) + torch.save(model.state_dict(), model_save_PATH) + with open(training_history_PATH, 'w') as f: + json.dump(training_history, f) + + + except KeyboardInterrupt: + + """ + This Part Saves the Model if at any Time the Training gets cancelled argo the programm gets Closed + """ + + with open(training_save_PATH, 'w') as f: + sys.stdout.write("\033[K") + print("---------------------") + print("[!!!] Trainings Interruption SAVING TRAINING [!!!]") + json.dump(last_eb, f) + torch.save(model.state_dict(), model_save_PATH) + print("[!!!] Training Saved [!!!]") + print("[!!!] Epoch: " + str(epoch) + " Batch: " + str(count)+" [!!!]") + with open(training_history_PATH, 'w') as f: + json.dump(training_history, f) + + """ + After Training save the model and Training Histroy + """ + sys.stdout.write("\033[K") + print("---------------------") + print(f'Finished Training, Loss: {sum(training_history[0])/len(training_history[0]):.4f}') + torch.save(model.state_dict(), model_save_PATH) + with open(training_history_PATH, 'w') as f: + json.dump(training_history, f) \ No newline at end of file -- GitLab