From dc75a82ddb80bf55ec4d099ab148d3f2af7c7ec1 Mon Sep 17 00:00:00 2001
From: beardyface <henryamwilliams@gmail.com>
Date: Fri, 13 Oct 2023 16:29:45 +1300
Subject: [PATCH 1/4] Moved train_loops into library for external use

---
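Note: the example script now imports the training loops from
cares_reinforcement_learning.train_loops so they can be reused outside the
examples. A minimal usage sketch, assuming env, agent, memory, record and
the args dictionary are already constructed as in example_training_loops.py:

    import cares_reinforcement_learning.train_loops.policy_loop as pbe

    # run the policy-based training loop with externally constructed objects
    pbe.policy_based_train(env, agent, memory, record, args)
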
 .../util/arguement_parser.py                  |   2 +-
 example/example_training_loops.py             |   6 +-
 example/policy_example.py                     | 143 -----------------
 example/ppo_example.py                        | 130 ----------------
 example/value_example.py                      | 147 ------------------
 5 files changed, 4 insertions(+), 424 deletions(-)
 delete mode 100644 example/policy_example.py
 delete mode 100644 example/ppo_example.py
 delete mode 100644 example/value_example.py

diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py
index 69db253a..8f889962 100644
--- a/cares_reinforcement_learning/util/arguement_parser.py
+++ b/cares_reinforcement_learning/util/arguement_parser.py
@@ -86,7 +86,7 @@ def parse_args():
     parser.add_argument('--G', type=int, default=10, help="Number of learning updates each step of training")
     parser.add_argument('--batch_size', type=int, default=32, help="Batch Size used during training")
 
-    parser.add_argument('--max_steps_exploration', type=int, default=10000, help="Total number of steps for exploration before training")
+    parser.add_argument('--max_steps_exploration', type=int, default=1000, help="Total number of steps for exploration before training")
     parser.add_argument('--max_steps_training', type=int, default=100000, help="Total number of steps to train the algorithm")
 
     parser.add_argument('--number_steps_per_evaluation', type=int, default=10000, help="The number of steps inbetween evaluation runs during training")
diff --git a/example/example_training_loops.py b/example/example_training_loops.py
index c3b2551c..1ee36c7e 100644
--- a/example/example_training_loops.py
+++ b/example/example_training_loops.py
@@ -9,9 +9,9 @@
 from cares_reinforcement_learning.util import EnvironmentFactory
 from cares_reinforcement_learning.util import arguement_parser as ap
 
-import example.policy_example as pbe
-import example.value_example as vbe
-import ppo_example as ppe
+import cares_reinforcement_learning.train_loops.policy_loop as pbe
+import cares_reinforcement_learning.train_loops.value_loop as vbe
+import cares_reinforcement_learning.train_loops.ppo_loop as ppe
 
 import gym
 from gym import spaces
diff --git a/example/policy_example.py b/example/policy_example.py
deleted file mode 100644
index 494bb2e1..00000000
--- a/example/policy_example.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from cares_reinforcement_learning.memory import MemoryBuffer
-from cares_reinforcement_learning.memory.augments import *
-from cares_reinforcement_learning.util import helpers as hlp, Record
-
-import cv2
-import time
-import gym
-import logging
-import numpy as np
-
-def evaluate_policy_network(env, agent, args, record=None, total_steps=0):
-
-    if record is not None:
-        frame = env.grab_frame()
-        record.start_video(total_steps+1, frame)
-
-    number_eval_episodes = int(args["number_eval_episodes"])
-    
-    state = env.reset()
-
-    for eval_episode_counter in range(number_eval_episodes):
-        episode_timesteps = 0
-        episode_reward = 0
-        episode_num = 0
-        done = False
-        truncated = False
-
-        while not done and not truncated:
-            episode_timesteps += 1
-            action = agent.select_action_from_policy(state, evaluation=True)
-            action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)
-
-            state, reward, done, truncated = env.step(action_env)
-            episode_reward += reward
-
-            if eval_episode_counter == 0 and record is not None:
-                frame = env.grab_frame()
-                record.log_video(frame)
-
-            if done or truncated:
-                if record is not None:
-                    record.log_eval(
-                        total_steps=total_steps+1,
-                        episode=eval_episode_counter+1, 
-                        episode_reward=episode_reward,
-                        display=True
-                    )
-
-                # Reset environment
-                state = env.reset()
-                episode_reward = 0
-                episode_timesteps = 0
-                episode_num += 1
-    
-    record.stop_video()
-
-def policy_based_train(env, agent, memory, record, args):
-    start_time = time.time()
-
-    max_steps_training = args["max_steps_training"]
-    max_steps_exploration = args["max_steps_exploration"]
-    number_steps_per_evaluation = args["number_steps_per_evaluation"]
-
-    logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}")
-
-    batch_size = args["batch_size"]
-    seed = args["seed"]
-    G = args["G"]
-
-    episode_timesteps = 0
-    episode_reward = 0
-    episode_num = 0
-
-    evaluate = False
-
-    state = env.reset()
-
-    episode_start = time.time()
-    for total_step_counter in range(int(max_steps_training)):
-        episode_timesteps += 1
-
-        if total_step_counter < max_steps_exploration:
-            logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}")
-            # action range the env uses [e.g. -2 , 2 for pendulum]
-            action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num)
-            # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai
-            action = hlp.normalize(action_env, env.max_action_value, env.min_action_value)  
-        else:
-            # algorithm range [-1, 1]
-            action = agent.select_action_from_policy(state)
-            # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai
-            action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)  
-
-        next_state, reward, done, truncated = env.step(action_env)
-        memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done)
-
-        state = next_state
-        episode_reward += reward
-
-        if total_step_counter >= max_steps_exploration:
-            for i in range(G):
-                experience = memory.sample(batch_size)
-                info = agent.train_policy((
-                    experience['state'],
-                    experience['action'],
-                    experience['reward'],
-                    experience['next_state'],
-                    experience['done']
-                ))
-                memory.update_priorities(experience['indices'], info)
-                # record.log_info(info, display=False)
-
-        if (total_step_counter+1) % number_steps_per_evaluation == 0:
-            evaluate = True
-
-        if done or truncated:
-            episode_time = time.time() - episode_start
-            record.log_train(
-                total_steps = total_step_counter + 1,
-                episode = episode_num + 1,
-                episode_steps=episode_timesteps,
-                episode_reward = episode_reward,
-                episode_time = episode_time,
-                display = True
-            )
-
-            if evaluate:
-                logging.info("*************--Evaluation Loop--*************")
-                args["evaluation_seed"] = seed
-                evaluate_policy_network(env, agent, args, record=record, total_steps=total_step_counter)
-                logging.info("--------------------------------------------")
-                evaluate = False
-
-            # Reset environment
-            state = env.reset()
-            episode_timesteps = 0
-            episode_reward = 0
-            episode_num += 1
-            episode_start = time.time()
-
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
\ No newline at end of file
diff --git a/example/ppo_example.py b/example/ppo_example.py
deleted file mode 100644
index 25671d0d..00000000
--- a/example/ppo_example.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from cares_reinforcement_learning.memory import *
-from cares_reinforcement_learning.util import helpers as hlp
-from cares_reinforcement_learning.util import Record
-
-import time
-import gym
-import logging
-
-from timeit import default_timer as timer
-
-def evaluate_ppo_network(env, agent, args, record=None, total_steps=0):
-    
-    if record is not None:
-        frame = env.grab_frame()
-        record.start_video(total_steps+1, frame)
-
-    number_eval_episodes = int(args["number_eval_episodes"])
-
-    state = env.reset()
-
-    for eval_episode_counter in range(number_eval_episodes):
-        episode_timesteps = 0
-        episode_reward = 0
-        episode_num = 0
-        done = False
-        truncated = False
-        
-        while not done and not truncated:
-            episode_timesteps += 1
-            action, log_prob = agent.select_action_from_policy(state)
-            action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)
-
-            state, reward, done, truncated = env.step(action_env)
-            episode_reward += reward
-
-            if eval_episode_counter == 0 and record is not None:
-                frame = env.grab_frame()
-                record.log_video(frame)
-
-            if done or truncated:
-                if record is not None:
-                    record.log_eval(
-                            total_steps=total_steps+1,
-                            episode=eval_episode_counter+1, 
-                            episode_reward=episode_reward,
-                            display=True
-                        )
-                
-                # Reset environment
-                state = env.reset()
-                episode_reward = 0
-                episode_timesteps = 0
-                episode_num += 1
-
-    record.stop_video()
-
-def ppo_train(env, agent, record, args):
-    start_time = time.time()
-
-    seed = args["seed"]
-    max_steps_training = args["max_steps_training"]
-    max_steps_per_batch = args["max_steps_per_batch"]
-    number_steps_per_evaluation = args["number_steps_per_evaluation"]
-
-    episode_timesteps = 0
-    episode_num = 0
-    episode_reward = 0
-
-    memory = MemoryBuffer()
-
-    evaluate = False
-
-    state = env.reset()
-
-    episode_start = time.time()
-    for total_step_counter in range(int(max_steps_training)):
-        episode_timesteps += 1
-
-        action, log_prob = agent.select_action_from_policy(state)
-        action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)
-
-        next_state, reward, done, truncated = env.step(action_env)
-        memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob)
-
-        state = next_state
-        episode_reward += reward
-
-        if (total_step_counter+1) % max_steps_per_batch == 0:
-            experience = memory.flush()
-            info = agent.train_policy((
-                experience['state'],
-                experience['action'],
-                experience['reward'],
-                experience['next_state'],
-                experience['done'],
-                experience['log_prob']
-            ))
-            # record.log_info(info, display=False)
-
-        if (total_step_counter+1) % number_steps_per_evaluation == 0:
-            evaluate = True
-
-        if done or truncated:
-            episode_time = time.time() - episode_start
-            record.log_train(
-                total_steps = total_step_counter + 1,
-                episode = episode_num + 1,
-                episode_steps=episode_timesteps,
-                episode_reward = episode_reward,
-                episode_time = episode_time,
-                display = True
-            )
-
-            if evaluate:
-                logging.info("*************--Evaluation Loop--*************")
-                args["evaluation_seed"] = seed
-                evaluate_ppo_network(env, agent, args, record=record, total_steps=total_step_counter)
-                logging.info("--------------------------------------------")
-                evaluate = False
-
-            # Reset environment
-            state = env.reset()
-            episode_timesteps = 0
-            episode_reward = 0
-            episode_num += 1
-            episode_start = time.time()
-
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
\ No newline at end of file
diff --git a/example/value_example.py b/example/value_example.py
deleted file mode 100644
index ffe1ab64..00000000
--- a/example/value_example.py
+++ /dev/null
@@ -1,147 +0,0 @@
-from cares_reinforcement_learning.memory import *
-from cares_reinforcement_learning.util import helpers as hlp, Record
-
-import numpy as np
-import time
-import gym
-import logging
-import random
-
-from random import randrange
-
-from timeit import default_timer as timer
-
-def evaluate_value_network(env, agent, args, record=None, total_steps=0):
-
-    if record is not None:
-        frame = env.grab_frame()
-        record.start_video(total_steps+1, frame)
-
-    number_eval_episodes = int(args["number_eval_episodes"])
-    
-    state = env.reset()
-    
-    exploration_rate = args["exploration_min"]
-
-    for eval_episode_counter in range(number_eval_episodes):
-        episode_timesteps = 0
-        episode_reward = 0
-        episode_num = 0
-        done = False
-        truncated = False
-        
-        while not done and not truncated:
-            episode_timesteps += 1
-
-            if random.random() < exploration_rate:
-                action = randrange(env.action_num)
-            else:
-                action = agent.select_action_from_policy(state)
-
-            state, reward, done, truncated = env.step(action)
-            episode_reward += reward
-
-            if eval_episode_counter == 0 and record is not None:
-                frame = env.grab_frame()
-                record.log_video(frame)
-
-            if done or truncated:
-                if record is not None:
-                    record.log_eval(
-                        total_steps=total_steps+1,
-                        episode=eval_episode_counter+1, 
-                        episode_reward=episode_reward,
-                        display=True
-                    )
-
-                # Reset environment
-                state = env.reset()
-                episode_reward = 0
-                episode_timesteps = 0
-                episode_num += 1
-
-    record.stop_video()
-
-def value_based_train(env, agent, memory, record, args):
-    start_time = time.time()
-
-    max_steps_training = args["max_steps_training"]
-    exploration_min = args["exploration_min"]
-    exploration_decay = args["exploration_decay"]
-    number_steps_per_evaluation = args["number_steps_per_evaluation"]
-
-    batch_size = args["batch_size"]
-    seed = args["seed"]
-    G = args["G"]
-
-    episode_timesteps = 0
-    episode_reward = 0
-    episode_num = 0
-    
-    evaluate = False
-
-    state = env.reset()
-
-    exploration_rate = 1
-
-    episode_start = time.time()
-    for total_step_counter in range(int(max_steps_training)):
-        episode_timesteps += 1
-
-        exploration_rate *= exploration_decay
-        exploration_rate = max(exploration_min, exploration_rate)
-
-        if random.random() < exploration_rate:
-            action = randrange(env.action_num)
-        else:
-            action = agent.select_action_from_policy(state)
-
-        next_state, reward, done, truncated = env.step(action)
-        memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done)
-        state = next_state
-        episode_reward += reward
-
-        if len(memory) > batch_size:
-            for _ in range(G):
-                experience = memory.sample(batch_size)
-                info = agent.train_policy((
-                    experience['state'],
-                    experience['action'],
-                    experience['reward'],
-                    experience['next_state'],
-                    experience['done']
-                ))
-                memory.update_priorities(experience['indices'], info)
-                # record.log_info(info, display=False)
-            
-        if (total_step_counter+1) % number_steps_per_evaluation == 0:
-            evaluate = True
-
-        if done or truncated:
-            episode_time = time.time() - episode_start
-            record.log_train(
-                total_steps = total_step_counter + 1,
-                episode = episode_num + 1,
-                episode_steps=episode_timesteps,
-                episode_reward = episode_reward,
-                episode_time = episode_time,
-                display = True
-            )
-
-            if evaluate:
-                logging.info("*************--Evaluation Loop--*************")
-                args["evaluation_seed"] = seed
-                evaluate_value_network(env, agent, args, record=record, total_steps=total_step_counter)
-                logging.info("--------------------------------------------")
-                evaluate = False
-
-            # Reset environment
-            state = env.reset()
-            episode_timesteps = 0
-            episode_reward = 0
-            episode_num += 1
-            episode_start = time.time()
-
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
\ No newline at end of file

From d32e72312e31409f468eef1a3382b54a2ea44fc4 Mon Sep 17 00:00:00 2001
From: beardyface <henryamwilliams@gmail.com>
Date: Fri, 13 Oct 2023 16:37:48 +1300
Subject: [PATCH 2/4] Updated Network Factory behaviour

---
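Note: create_network now logs a warning and returns None for algorithms
outside the default cares_rl set instead of raising ValueError, so callers
can chain in their own factories. A sketch of the expected calling pattern,
assuming network_factory and args are set up as in example_training_loops.py
and create_custom_agent is a hypothetical project-specific fallback:

    agent = network_factory.create_network(args["algorithm"], args)
    if agent is None:
        # not a default cares_rl algorithm - hand off to a project factory
        agent = create_custom_agent(args["algorithm"], args)
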
 .../train_loops/policy_loop.py                | 143 +++++++++++++++++
 .../train_loops/ppo_loop.py                   | 130 ++++++++++++++++
 .../train_loops/value_loop.py                 | 147 ++++++++++++++++++
 .../util/NetworkFactory.py                    |   4 +-
 example/example_training_loops.py             |   2 +
 5 files changed, 425 insertions(+), 1 deletion(-)
 create mode 100644 cares_reinforcement_learning/train_loops/policy_loop.py
 create mode 100644 cares_reinforcement_learning/train_loops/ppo_loop.py
 create mode 100644 cares_reinforcement_learning/train_loops/value_loop.py

diff --git a/cares_reinforcement_learning/train_loops/policy_loop.py b/cares_reinforcement_learning/train_loops/policy_loop.py
new file mode 100644
index 00000000..494bb2e1
--- /dev/null
+++ b/cares_reinforcement_learning/train_loops/policy_loop.py
@@ -0,0 +1,143 @@
+from cares_reinforcement_learning.memory import MemoryBuffer
+from cares_reinforcement_learning.memory.augments import *
+from cares_reinforcement_learning.util import helpers as hlp, Record
+
+import cv2
+import time
+import gym
+import logging
+import numpy as np
+
+def evaluate_policy_network(env, agent, args, record=None, total_steps=0):
+
+    if record is not None:
+        frame = env.grab_frame()
+        record.start_video(total_steps+1, frame)
+
+    number_eval_episodes = int(args["number_eval_episodes"])
+    
+    state = env.reset()
+
+    for eval_episode_counter in range(number_eval_episodes):
+        episode_timesteps = 0
+        episode_reward = 0
+        episode_num = 0
+        done = False
+        truncated = False
+
+        while not done and not truncated:
+            episode_timesteps += 1
+            action = agent.select_action_from_policy(state, evaluation=True)
+            action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)
+
+            state, reward, done, truncated = env.step(action_env)
+            episode_reward += reward
+
+            if eval_episode_counter == 0 and record is not None:
+                frame = env.grab_frame()
+                record.log_video(frame)
+
+            if done or truncated:
+                if record is not None:
+                    record.log_eval(
+                        total_steps=total_steps+1,
+                        episode=eval_episode_counter+1, 
+                        episode_reward=episode_reward,
+                        display=True
+                    )
+
+                # Reset environment
+                state = env.reset()
+                episode_reward = 0
+                episode_timesteps = 0
+                episode_num += 1
+    
+    record.stop_video()
+
+def policy_based_train(env, agent, memory, record, args):
+    start_time = time.time()
+
+    max_steps_training = args["max_steps_training"]
+    max_steps_exploration = args["max_steps_exploration"]
+    number_steps_per_evaluation = args["number_steps_per_evaluation"]
+
+    logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}")
+
+    batch_size = args["batch_size"]
+    seed = args["seed"]
+    G = args["G"]
+
+    episode_timesteps = 0
+    episode_reward = 0
+    episode_num = 0
+
+    evaluate = False
+
+    state = env.reset()
+
+    episode_start = time.time()
+    for total_step_counter in range(int(max_steps_training)):
+        episode_timesteps += 1
+
+        if total_step_counter < max_steps_exploration:
+            logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}")
+            # action range the env uses [e.g. -2 , 2 for pendulum]
+            action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num)
+            # algorithm range [-1, 1] - note for DMCS this is redundant but required for openai
+            action = hlp.normalize(action_env, env.max_action_value, env.min_action_value)  
+        else:
+            # algorithm range [-1, 1]
+            action = agent.select_action_from_policy(state)
+            # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redundant but required for openai
+            action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)  
+
+        next_state, reward, done, truncated = env.step(action_env)
+        memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done)
+
+        state = next_state
+        episode_reward += reward
+
+        if total_step_counter >= max_steps_exploration:
+            for i in range(G):
+                experience = memory.sample(batch_size)
+                info = agent.train_policy((
+                    experience['state'],
+                    experience['action'],
+                    experience['reward'],
+                    experience['next_state'],
+                    experience['done']
+                ))
+                memory.update_priorities(experience['indices'], info)
+                # record.log_info(info, display=False)
+
+        if (total_step_counter+1) % number_steps_per_evaluation == 0:
+            evaluate = True
+
+        if done or truncated:
+            episode_time = time.time() - episode_start
+            record.log_train(
+                total_steps = total_step_counter + 1,
+                episode = episode_num + 1,
+                episode_steps=episode_timesteps,
+                episode_reward = episode_reward,
+                episode_time = episode_time,
+                display = True
+            )
+
+            if evaluate:
+                logging.info("*************--Evaluation Loop--*************")
+                args["evaluation_seed"] = seed
+                evaluate_policy_network(env, agent, args, record=record, total_steps=total_step_counter)
+                logging.info("--------------------------------------------")
+                evaluate = False
+
+            # Reset environment
+            state = env.reset()
+            episode_timesteps = 0
+            episode_reward = 0
+            episode_num += 1
+            episode_start = time.time()
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
\ No newline at end of file
diff --git a/cares_reinforcement_learning/train_loops/ppo_loop.py b/cares_reinforcement_learning/train_loops/ppo_loop.py
new file mode 100644
index 00000000..25671d0d
--- /dev/null
+++ b/cares_reinforcement_learning/train_loops/ppo_loop.py
@@ -0,0 +1,130 @@
+from cares_reinforcement_learning.memory import *
+from cares_reinforcement_learning.util import helpers as hlp
+from cares_reinforcement_learning.util import Record
+
+import time
+import gym
+import logging
+
+from timeit import default_timer as timer
+
+def evaluate_ppo_network(env, agent, args, record=None, total_steps=0):
+    
+    if record is not None:
+        frame = env.grab_frame()
+        record.start_video(total_steps+1, frame)
+
+    number_eval_episodes = int(args["number_eval_episodes"])
+
+    state = env.reset()
+
+    for eval_episode_counter in range(number_eval_episodes):
+        episode_timesteps = 0
+        episode_reward = 0
+        episode_num = 0
+        done = False
+        truncated = False
+        
+        while not done and not truncated:
+            episode_timesteps += 1
+            action, log_prob = agent.select_action_from_policy(state)
+            action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)
+
+            state, reward, done, truncated = env.step(action_env)
+            episode_reward += reward
+
+            if eval_episode_counter == 0 and record is not None:
+                frame = env.grab_frame()
+                record.log_video(frame)
+
+            if done or truncated:
+                if record is not None:
+                    record.log_eval(
+                            total_steps=total_steps+1,
+                            episode=eval_episode_counter+1, 
+                            episode_reward=episode_reward,
+                            display=True
+                        )
+                
+                # Reset environment
+                state = env.reset()
+                episode_reward = 0
+                episode_timesteps = 0
+                episode_num += 1
+
+    record.stop_video()
+
+def ppo_train(env, agent, record, args):
+    start_time = time.time()
+
+    seed = args["seed"]
+    max_steps_training = args["max_steps_training"]
+    max_steps_per_batch = args["max_steps_per_batch"]
+    number_steps_per_evaluation = args["number_steps_per_evaluation"]
+
+    episode_timesteps = 0
+    episode_num = 0
+    episode_reward = 0
+
+    memory = MemoryBuffer()
+
+    evaluate = False
+
+    state = env.reset()
+
+    episode_start = time.time()
+    for total_step_counter in range(int(max_steps_training)):
+        episode_timesteps += 1
+
+        action, log_prob = agent.select_action_from_policy(state)
+        action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value)
+
+        next_state, reward, done, truncated = env.step(action_env)
+        memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob)
+
+        state = next_state
+        episode_reward += reward
+
+        if (total_step_counter+1) % max_steps_per_batch == 0:
+            experience = memory.flush()
+            info = agent.train_policy((
+                experience['state'],
+                experience['action'],
+                experience['reward'],
+                experience['next_state'],
+                experience['done'],
+                experience['log_prob']
+            ))
+            # record.log_info(info, display=False)
+
+        if (total_step_counter+1) % number_steps_per_evaluation == 0:
+            evaluate = True
+
+        if done or truncated:
+            episode_time = time.time() - episode_start
+            record.log_train(
+                total_steps = total_step_counter + 1,
+                episode = episode_num + 1,
+                episode_steps=episode_timesteps,
+                episode_reward = episode_reward,
+                episode_time = episode_time,
+                display = True
+            )
+
+            if evaluate:
+                logging.info("*************--Evaluation Loop--*************")
+                args["evaluation_seed"] = seed
+                evaluate_ppo_network(env, agent, args, record=record, total_steps=total_step_counter)
+                logging.info("--------------------------------------------")
+                evaluate = False
+
+            # Reset environment
+            state = env.reset()
+            episode_timesteps = 0
+            episode_reward = 0
+            episode_num += 1
+            episode_start = time.time()
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
\ No newline at end of file
diff --git a/cares_reinforcement_learning/train_loops/value_loop.py b/cares_reinforcement_learning/train_loops/value_loop.py
new file mode 100644
index 00000000..ffe1ab64
--- /dev/null
+++ b/cares_reinforcement_learning/train_loops/value_loop.py
@@ -0,0 +1,147 @@
+from cares_reinforcement_learning.memory import *
+from cares_reinforcement_learning.util import helpers as hlp, Record
+
+import numpy as np
+import time
+import gym
+import logging
+import random
+
+from random import randrange
+
+from timeit import default_timer as timer
+
+def evaluate_value_network(env, agent, args, record=None, total_steps=0):
+
+    if record is not None:
+        frame = env.grab_frame()
+        record.start_video(total_steps+1, frame)
+
+    number_eval_episodes = int(args["number_eval_episodes"])
+    
+    state = env.reset()
+    
+    exploration_rate = args["exploration_min"]
+
+    for eval_episode_counter in range(number_eval_episodes):
+        episode_timesteps = 0
+        episode_reward = 0
+        episode_num = 0
+        done = False
+        truncated = False
+        
+        while not done and not truncated:
+            episode_timesteps += 1
+
+            if random.random() < exploration_rate:
+                action = randrange(env.action_num)
+            else:
+                action = agent.select_action_from_policy(state)
+
+            state, reward, done, truncated = env.step(action)
+            episode_reward += reward
+
+            if eval_episode_counter == 0 and record is not None:
+                frame = env.grab_frame()
+                record.log_video(frame)
+
+            if done or truncated:
+                if record is not None:
+                    record.log_eval(
+                        total_steps=total_steps+1,
+                        episode=eval_episode_counter+1, 
+                        episode_reward=episode_reward,
+                        display=True
+                    )
+
+                # Reset environment
+                state = env.reset()
+                episode_reward = 0
+                episode_timesteps = 0
+                episode_num += 1
+
+    record.stop_video()
+
+def value_based_train(env, agent, memory, record, args):
+    start_time = time.time()
+
+    max_steps_training = args["max_steps_training"]
+    exploration_min = args["exploration_min"]
+    exploration_decay = args["exploration_decay"]
+    number_steps_per_evaluation = args["number_steps_per_evaluation"]
+
+    batch_size = args["batch_size"]
+    seed = args["seed"]
+    G = args["G"]
+
+    episode_timesteps = 0
+    episode_reward = 0
+    episode_num = 0
+    
+    evaluate = False
+
+    state = env.reset()
+
+    exploration_rate = 1
+
+    episode_start = time.time()
+    for total_step_counter in range(int(max_steps_training)):
+        episode_timesteps += 1
+
+        exploration_rate *= exploration_decay
+        exploration_rate = max(exploration_min, exploration_rate)
+
+        if random.random() < exploration_rate:
+            action = randrange(env.action_num)
+        else:
+            action = agent.select_action_from_policy(state)
+
+        next_state, reward, done, truncated = env.step(action)
+        memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done)
+        state = next_state
+        episode_reward += reward
+
+        if len(memory) > batch_size:
+            for _ in range(G):
+                experience = memory.sample(batch_size)
+                info = agent.train_policy((
+                    experience['state'],
+                    experience['action'],
+                    experience['reward'],
+                    experience['next_state'],
+                    experience['done']
+                ))
+                memory.update_priorities(experience['indices'], info)
+                # record.log_info(info, display=False)
+            
+        if (total_step_counter+1) % number_steps_per_evaluation == 0:
+            evaluate = True
+
+        if done or truncated:
+            episode_time = time.time() - episode_start
+            record.log_train(
+                total_steps = total_step_counter + 1,
+                episode = episode_num + 1,
+                episode_steps=episode_timesteps,
+                episode_reward = episode_reward,
+                episode_time = episode_time,
+                display = True
+            )
+
+            if evaluate:
+                logging.info("*************--Evaluation Loop--*************")
+                args["evaluation_seed"] = seed
+                evaluate_value_network(env, agent, args, record=record, total_steps=total_step_counter)
+                logging.info("--------------------------------------------")
+                evaluate = False
+
+            # Reset environment
+            state = env.reset()
+            episode_timesteps = 0
+            episode_reward = 0
+            episode_num += 1
+            episode_start = time.time()
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
\ No newline at end of file
diff --git a/cares_reinforcement_learning/util/NetworkFactory.py b/cares_reinforcement_learning/util/NetworkFactory.py
index 91ea3a11..acf0c656 100644
--- a/cares_reinforcement_learning/util/NetworkFactory.py
+++ b/cares_reinforcement_learning/util/NetworkFactory.py
@@ -1,4 +1,5 @@
 import torch
+import logging
 
 def create_DQN(args):
     from cares_reinforcement_learning.algorithm.value import DQN
@@ -159,4 +160,5 @@ def create_network(self, algorithm, args):
             return create_SAC(args)
         elif algorithm == "TD3":
             return create_TD3(args)
-        raise ValueError(f"Unkown algorithm: {algorithm}")
+logging.warning(f"Algorithm: {algorithm} is not in the default cares_rl factory")
+        return None
diff --git a/example/example_training_loops.py b/example/example_training_loops.py
index 1ee36c7e..20c8d915 100644
--- a/example/example_training_loops.py
+++ b/example/example_training_loops.py
@@ -57,6 +57,8 @@ def main():
 
         logging.info(f"Algorithm: {args['algorithm']}")
         agent = network_factory.create_network(args["algorithm"], args)
+        if agent is None:
+            raise ValueError(f"Unknown agent for default algorithms: {args['algorithm']}")
 
         memory = memory_factory.create_memory(args['memory'], args)
         logging.info(f"Memory: {args['memory']}")

From 6fde5a90bd65f999f6893d0ce71ef02965f76386 Mon Sep 17 00:00:00 2001
From: beardyface <henryamwilliams@gmail.com>
Date: Mon, 16 Oct 2023 11:36:03 +1300
Subject: [PATCH 3/4] Adjusted args to be modular externally

---
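Note: parse_args() is replaced by environment_parser() and create_parser()
so external projects can compose the parser with their own arguments. A
sketch of external use, assuming the module is imported as ap (as in
example_training_loops.py) and --my_option is a hypothetical
project-specific argument:

    import argparse

    env_parser = ap.environment_parser()  # built with add_help=False, so usable as a parent
    my_parser = argparse.ArgumentParser(parents=[env_parser])
    my_parser.add_argument('--my_option', type=int, default=0)
    args = vars(my_parser.parse_args())
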
 .../util/arguement_parser.py                     | 16 ++++++++--------
 example/example_training_loops.py                |  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py
index 8f889962..d968e401 100644
--- a/cares_reinforcement_learning/util/arguement_parser.py
+++ b/cares_reinforcement_learning/util/arguement_parser.py
@@ -73,9 +73,9 @@ def algorithm_args(parent_parser):
     parser_DoubleDQN.add_argument('--exploration_min', type=float, default=1e-3)
     parser_DoubleDQN.add_argument('--exploration_decay', type=float, default=0.95)
 
-    return alg_parser
+    return alg_parser, alg_parsers
 
-def parse_args():
+def environment_parser():
     parser = argparse.ArgumentParser(add_help=False)  # Add an argument
     
     parser.add_argument('--number_training_iterations', type=int, default=1, help="Total amount of training iterations to complete")
@@ -97,10 +97,10 @@ def parse_args():
     parser.add_argument('--plot_frequency', type=int, default=100, help="How many steps between updating the running plot of the training and evaluation data during training")
     parser.add_argument('--checkpoint_frequency', type=int, default=100, help="How many steps between saving check point models of the agent during training")
 
-    parser = algorithm_args(parent_parser=parser)
+    return parser
+
+def create_parser():
+    parser = environment_parser()
+    parser, alg_parsers = algorithm_args(parent_parser=parser)
     parser = environment_args(parent_parser=parser)
-    
-    return vars(parser.parse_args()) # converts to a dictionary
-  
-if __name__ == '__main__':
-    print(parse_args())
+    return parser
\ No newline at end of file
diff --git a/example/example_training_loops.py b/example/example_training_loops.py
index 20c8d915..c8ac76f4 100644
--- a/example/example_training_loops.py
+++ b/example/example_training_loops.py
@@ -28,7 +28,8 @@ def set_seed(seed):
     random.seed(seed)
 
 def main():
-    args = ap.parse_args()
+    parser = ap.create_parser()
+    args = vars(parser.parse_args()) # converts to a dictionary
 
     args["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     logging.info(f"Device: {args['device']}")

From be422cd8441ce3c45153e943015d588b168344cd Mon Sep 17 00:00:00 2001
From: beardyface <henryamwilliams@gmail.com>
Date: Mon, 16 Oct 2023 11:47:21 +1300
Subject: [PATCH 4/4] Updated tests

---
 tests/test_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index c8e83cc2..b4348a8b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -72,9 +72,9 @@ def test_create_network():
 
     agent = factory.create_network("TD3", args)
     assert isinstance(agent, TD3), "Failed to create TD3 agent"
-
-    with pytest.raises(ValueError):
-        factory.create_network("Unknown", args)
+
+    agent = factory.create_network("Unknown", args)
+    assert agent is None, f"Unknown algorithm did not return None: returned {agent}"
 
 
 def test_denormalize():