import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import os
import sys
import numpy as np
import time
import yaml
import json
import cv2
"""
Author: Jenö Faist, Paul Judis
Refernces: LFI-3 cnn.py
"""
"""
Template File to Train a CNN Network
"""
"""
This file trains the convolutional neural network with the selected dataset and hyperparameters.
The model is saved in the TRAIN_MODELS folder and can be further trained from there or moved to
COMPLETE_MODELS to save it.
"""
if __name__ == '__main__':
"""
You Define all Data Paths here commonly you save the Model and Hyperparmeters in TRAIN_MODELS
"""
absolutepath = os.path.dirname(__file__)
    training_set_PATH = absolutepath+'/DATASETS/early_3D_Kegel_SET/train'
hyperparameters_PATH = absolutepath+'/TRAIN_MODELS/hyperparameters.yaml'
"""
All Save PATHS
"""
model_save_PATH = absolutepath+'/TRAIN_MODELS/3D_DEC_MODEL_.pt'
training_history_PATH = absolutepath+'/TRAIN_MODELS/3D_DEC_MODEL_TRAIN_HISTORY_.json'
training_save_PATH = absolutepath+'/TRAIN_MODELS/3D_DEC_MODEL_TRAIN_SAVE_.json'
"""
This are the References Printed in the Consol to insure that CUDA uses your GPU
and the DATA Paths are set Correctly
"""
print("STARTING CNN TRAINING")
print("---------------------------")
print("Cuda Version: " + torch.version.cuda)
print("Cuda: "+str(torch.cuda.is_available()))
print("GPU: "+str(torch.cuda.get_device_name()))
print("Current Folderpath: "+absolutepath)
print("Model Saving in: "+model_save_PATH)
print("Training Save in: "+training_save_PATH)
print("Hyperparamtes in: "+hyperparameters_PATH)
print("Training History Save in: "+training_history_PATH)
print("---------------------------")
"""
Setting up Pytorch and setting the seed so that results can be recreated
"""
torch.manual_seed(0)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"""
Loading the Hyperparamters if Possible if not use Default Hyperparamters
"""
print("Loading Hyperparamters") # CONSOLE
data = {
'default':
{'batch_size': 64,
'num_epochs': 10,
'learning_rate': 0.001}
}
    try:
        with open(hyperparameters_PATH, "r") as stream:
            data = yaml.safe_load(stream)
        print("[!!!] Hyperparameters Loaded Successfully ! [!!!]")
    except (OSError, yaml.YAMLError):
        print("[?!?] Hyperparameters couldn't be loaded ! [?!?]")
        print("[!!!] Using Default Hyperparameters ! [!!!]")
# Set hyperparameters
num_epochs = data['default']['num_epochs']
batch_size = data['default']['batch_size']
learning_rate = data['default']['learning_rate']
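    # For reference, a hyperparameters.yaml matching the keys read above could
    # look like this (illustrative values mirroring the defaults, not
    # necessarily the ones used for the actual experiments):
    #
    # default:
    #   batch_size: 64
    #   num_epochs: 10
    #   learning_rate: 0.001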
print("---------------------------") # CONSOLE
"""
SETTING UP TRAINING DATA SET
Initialize transformations for data augmentation.
"""
transform = transforms.Compose(
[
transforms.ToTensor(),
transforms.Resize((256,256)),
]
)
# Load the Dataset with the transformations
    # Load the dataset with the transformations
    train_dataset = torchvision.datasets.ImageFolder(
        root=training_set_PATH,
        transform=transform
    )
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
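    # ImageFolder expects one subfolder per class below the root and assigns
    # labels in alphabetical folder order, e.g. (folder names hypothetical):
    #
    #   DATASETS/early_3D_Kegel_SET/train/class_a/img_001.png
    #   DATASETS/early_3D_Kegel_SET/train/class_b/img_002.png
    #
    # The two-class output layer defined below matches such a two-folder layout.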
"""
In this Path you Define the Neural Network Model that you want be using.
For this File: its default resnet18
"""
    model = torchvision.models.resnet18(weights='DEFAULT') # base model with pretrained weights
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 2) # replace the final layer with a two-class head
"""
This Loades the not fulled trained Model if training was canceled,
so that the Training of Model can be continued.
"""
    # Last training epoch and batch
    last_eb = [0,0]
    # Lists to save the loss and accuracy over time for later plots
    training_history = [[],[]]
    try:
        # map_location lets a checkpoint saved on GPU be resumed on CPU as well;
        # note: no model.eval() here, since the model is about to be trained further
        model.load_state_dict(torch.load(model_save_PATH, map_location=device), strict=False)
        with open(training_save_PATH, 'r') as f:
            last_eb = json.load(f)
        with open(training_history_PATH, 'r') as f:
            training_history = json.load(f)
        print("[!!!] Using previously trained model, starting from Epoch: "+str(last_eb[0])+" Batch: "+str(last_eb[1])+" [!!!]")
    except (OSError, json.JSONDecodeError, RuntimeError):
        print("[!!!] No last training model found, starting new training with base model [!!!]")
    ### MODEL TORCH MODIFIERS ###
    # Multi-GPU training is currently disabled because it caused errors (TODO).
    # Note that wrapping the model in DataParallel prefixes all state_dict keys
    # with 'module.', which breaks the plain load_state_dict() resume logic above.
    #model = torch.nn.DataParallel(model)
    # Set the model to run on the device
    model = model.to(device)
    model.train()  # ensure training mode (affects batch norm and dropout)
"""
Here you are defining wich error function and optimizer you want be using
"""
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
### CONSOLE PRINTS ####
print("---------------------")
print("Beginn Training")
print("Starting with " + "Epoch:"+str(last_eb[0])+" Batch:"+str(last_eb[1]))
print("---------------------")
#######################
# Time Variables for tracking
start_time = time.time()
current_time = time.time()
last_time = 0
"""
In here the Model gets Trainied using the specifications above.
The trainings loop is in a try so if a KeyboardInterrupt happens argo the Programm gets closed
the Model gets automaticly saved.
"""
try:
for epoch in range(last_eb[0],num_epochs):
"""
If we train a Model that gets continued in Training then this part insures that
the last trained batches get skipped.
"""
list_dt = []
count = last_eb[1]
train_loader_iter = iter(train_loader)
data_load_time_start = time.time()
data_load_last_time = 0
list_data_time = []
            for n in range(last_eb[1]):
                next(train_loader_iter)
                data_load_time = time.time() - data_load_time_start - data_load_last_time
                list_data_time.append(data_load_time)
                avg_dt = sum(list_data_time)/len(list_data_time)
                sys.stdout.write("\033[K")
                est_time = ((last_eb[1] - n)*avg_dt)/60**2
                print('Skipping last trained batches | Estimated time left: ' + '%.2f h ' % est_time + '| Batches to skip: '+str(last_eb[1] - n), end='\r')
                data_load_last_time = time.time() - data_load_time_start
sys.stdout.write("\033[K")
print("Training Epoch: "+str(epoch)) # Console
            # Iterate over the (partially consumed) iterator so that the
            # batches skipped above are actually skipped.
            for inputs, labels in train_loader_iter:
"""
Train the Model with this Batch
"""
# Move input and label tensors to the device
inputs = inputs.to(device)
labels = labels.to(device)
# Zero out the optimizer
optimizer.zero_grad()
# Forward pass
outputs = model(inputs)
loss = criterion(outputs, labels)
# Backward pass
loss.backward()
optimizer.step()
"""
The Real Training Ends here.
This Part is just for visuals so that you now how far the Model is trained
and how much time is left for this Epoch
"""
current_time = time.time() - start_time
dt = current_time - last_time
list_dt.append(dt)
training_history[0].append(loss.item())
                _, preds = torch.max(outputs, 1)
                # Number of correct predictions in this batch (for accuracy plots)
                training_history[1].append(torch.sum(preds == labels.data).item())
avg_dt = sum(list_dt)/len(list_dt)
avg_loss = sum(training_history[0])/len(training_history[0])
Time_Estimate = (round((len(train_dataset)-count*batch_size)/batch_size)*avg_dt)/60**2
sys.stdout.write("\033[K")
print('Training Model | Estimated Left Time for this Epoch: ' + "%.2f h" % Time_Estimate + "| Current Average Loss: " + "%.5f" % avg_loss + '| Batches Left:'+str(round((len(train_dataset)-count*batch_size)/batch_size)), end='\r')
last_time = current_time
count += 1
"""
Every Epoch the Model gets Saved Plus the Training History containing all Loses and Accurarcys for every Batch
"""
sys.stdout.write("\033[K")
print(f'Epoch {epoch+1}/{num_epochs} Done, Loss: {sum(training_history[0])/len(training_history[0]):.4f}')
last_eb = [epoch+1,0]
with open(training_save_PATH, 'w') as f:
json.dump(last_eb, f)
torch.save(model.state_dict(), model_save_PATH)
with open(training_history_PATH, 'w') as f:
json.dump(training_history, f)
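            # For reference, the two JSON files written above have this layout:
            #   training_save:    [next_epoch, next_batch]
            #   training_history: [[loss per batch, ...], [correct predictions per batch, ...]]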
    except KeyboardInterrupt:
        """
        This part saves the model if the training gets cancelled at any time,
        i.e. if the program gets closed.
        """
        sys.stdout.write("\033[K")
        print("---------------------")
        print("[!!!] Training Interrupted, SAVING TRAINING [!!!]")
        # Save the current position so training can resume mid-epoch
        last_eb = [epoch, count]
        with open(training_save_PATH, 'w') as f:
            json.dump(last_eb, f)
        torch.save(model.state_dict(), model_save_PATH)
        print("[!!!] Training Saved [!!!]")
        print("[!!!] Epoch: " + str(epoch) + " Batch: " + str(count)+" [!!!]")
        with open(training_history_PATH, 'w') as f:
            json.dump(training_history, f)
"""
After Training save the model and Training Histroy
"""
sys.stdout.write("\033[K")
print("---------------------")
print(f'Finished Training, Loss: {sum(training_history[0])/len(training_history[0]):.4f}')
torch.save(model.state_dict(), model_save_PATH)
with open(training_history_PATH, 'w') as f:
json.dump(training_history, f)
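    """
    A minimal sketch (illustrative, not part of the training itself) of how the
    saved model could be loaded later for inference, assuming the same
    two-class ResNet-18 head defined above:

        model = torchvision.models.resnet18()
        model.fc = nn.Linear(model.fc.in_features, 2)
        model.load_state_dict(torch.load(model_save_PATH, map_location='cpu'))
        model.eval()
    """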