From dc75a82ddb80bf55ec4d099ab148d3f2af7c7ec1 Mon Sep 17 00:00:00 2001 From: beardyface <henryamwilliams@gmail.com> Date: Fri, 13 Oct 2023 16:29:45 +1300 Subject: [PATCH 1/4] moved train_loops into library for external use --- .../util/arguement_parser.py | 2 +- example/example_training_loops.py | 6 +- example/policy_example.py | 143 ----------------- example/ppo_example.py | 130 ---------------- example/value_example.py | 147 ------------------ 5 files changed, 4 insertions(+), 424 deletions(-) delete mode 100644 example/policy_example.py delete mode 100644 example/ppo_example.py delete mode 100644 example/value_example.py diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py index 69db253a..8f889962 100644 --- a/cares_reinforcement_learning/util/arguement_parser.py +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -86,7 +86,7 @@ def parse_args(): parser.add_argument('--G', type=int, default=10, help="Number of learning updates each step of training") parser.add_argument('--batch_size', type=int, default=32, help="Batch Size used during training") - parser.add_argument('--max_steps_exploration', type=int, default=10000, help="Total number of steps for exploration before training") + parser.add_argument('--max_steps_exploration', type=int, default=1000, help="Total number of steps for exploration before training") parser.add_argument('--max_steps_training', type=int, default=100000, help="Total number of steps to train the algorithm") parser.add_argument('--number_steps_per_evaluation', type=int, default=10000, help="The number of steps inbetween evaluation runs during training") diff --git a/example/example_training_loops.py b/example/example_training_loops.py index c3b2551c..1ee36c7e 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -9,9 +9,9 @@ from cares_reinforcement_learning.util import EnvironmentFactory from cares_reinforcement_learning.util import arguement_parser as ap -import example.policy_example as pbe -import example.value_example as vbe -import ppo_example as ppe +import cares_reinforcement_learning.train_loops.policy_loop as pbe +import cares_reinforcement_learning.train_loops.value_loop as vbe +import cares_reinforcement_learning.train_loops.ppo_loop as ppe import gym from gym import spaces diff --git a/example/policy_example.py b/example/policy_example.py deleted file mode 100644 index 494bb2e1..00000000 --- a/example/policy_example.py +++ /dev/null @@ -1,143 +0,0 @@ -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.memory.augments import * -from cares_reinforcement_learning.util import helpers as hlp, Record - -import cv2 -import time -import gym -import logging -import numpy as np - -def evaluate_policy_network(env, agent, args, record=None, total_steps=0): - - if record is not None: - frame = env.grab_frame() - record.start_video(total_steps+1, frame) - - number_eval_episodes = int(args["number_eval_episodes"]) - - state = env.reset() - - for eval_episode_counter in range(number_eval_episodes): - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - done = False - truncated = False - - while not done and not truncated: - episode_timesteps += 1 - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - - state, reward, done, truncated = env.step(action_env) - episode_reward += reward - - if 
eval_episode_counter == 0 and record is not None: - frame = env.grab_frame() - record.log_video(frame) - - if done or truncated: - if record is not None: - record.log_eval( - total_steps=total_steps+1, - episode=eval_episode_counter+1, - episode_reward=episode_reward, - display=True - ) - - # Reset environment - state = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - record.stop_video() - -def policy_based_train(env, agent, memory, record, args): - start_time = time.time() - - max_steps_training = args["max_steps_training"] - max_steps_exploration = args["max_steps_exploration"] - number_steps_per_evaluation = args["number_steps_per_evaluation"] - - logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}") - - batch_size = args["batch_size"] - seed = args["seed"] - G = args["G"] - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - evaluate = False - - state = env.reset() - - episode_start = time.time() - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - if total_step_counter < max_steps_exploration: - logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}") - # action range the env uses [e.g. -2 , 2 for pendulum] - action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num) - # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai - action = hlp.normalize(action_env, env.max_action_value, env.min_action_value) - else: - # algorithm range [-1, 1] - action = agent.select_action_from_policy(state) - # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai - action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - - next_state, reward, done, truncated = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= max_steps_exploration: - for i in range(G): - experience = memory.sample(batch_size) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'] - )) - memory.update_priorities(experience['indices'], info) - # record.log_info(info, display=False) - - if (total_step_counter+1) % number_steps_per_evaluation == 0: - evaluate = True - - if done or truncated: - episode_time = time.time() - episode_start - record.log_train( - total_steps = total_step_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - episode_time = episode_time, - display = True - ) - - if evaluate: - logging.info("*************--Evaluation Loop--*************") - args["evaluation_seed"] = seed - evaluate_policy_network(env, agent, args, record=record, total_steps=total_step_counter) - logging.info("--------------------------------------------") - evaluate = False - - # Reset environment - state = env.reset() - episode_timesteps = 0 - episode_reward = 0 - episode_num += 1 - episode_start = time.time() - - end_time = time.time() - elapsed_time = end_time - start_time - print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) \ No newline at end of file diff --git a/example/ppo_example.py b/example/ppo_example.py deleted file mode 100644 index 25671d0d..00000000 --- a/example/ppo_example.py +++ /dev/null @@ 
-1,130 +0,0 @@ -from cares_reinforcement_learning.memory import * -from cares_reinforcement_learning.util import helpers as hlp -from cares_reinforcement_learning.util import Record - -import time -import gym -import logging - -from timeit import default_timer as timer - -def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): - - if record is not None: - frame = env.grab_frame() - record.start_video(total_steps+1, frame) - - number_eval_episodes = int(args["number_eval_episodes"]) - - state = env.reset() - - for eval_episode_counter in range(number_eval_episodes): - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - done = False - truncated = False - - while not done and not truncated: - episode_timesteps += 1 - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - - state, reward, done, truncated = env.step(action_env) - episode_reward += reward - - if eval_episode_counter == 0 and record is not None: - frame = env.grab_frame() - record.log_video(frame) - - if done or truncated: - if record is not None: - record.log_eval( - total_steps=total_steps+1, - episode=eval_episode_counter+1, - episode_reward=episode_reward, - display=True - ) - - # Reset environment - state = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - record.stop_video() - -def ppo_train(env, agent, record, args): - start_time = time.time() - - seed = args["seed"] - max_steps_training = args["max_steps_training"] - max_steps_per_batch = args["max_steps_per_batch"] - number_steps_per_evaluation = args["number_steps_per_evaluation"] - - episode_timesteps = 0 - episode_num = 0 - episode_reward = 0 - - memory = MemoryBuffer() - - evaluate = False - - state = env.reset() - - episode_start = time.time() - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - - next_state, reward, done, truncated = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) - - state = next_state - episode_reward += reward - - if (total_step_counter+1) % max_steps_per_batch == 0: - experience = memory.flush() - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'], - experience['log_prob'] - )) - # record.log_info(info, display=False) - - if (total_step_counter+1) % number_steps_per_evaluation == 0: - evaluate = True - - if done or truncated: - episode_time = time.time() - episode_start - record.log_train( - total_steps = total_step_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - episode_time = episode_time, - display = True - ) - - if evaluate: - logging.info("*************--Evaluation Loop--*************") - args["evaluation_seed"] = seed - evaluate_ppo_network(env, agent, args, record=record, total_steps=total_step_counter) - logging.info("--------------------------------------------") - evaluate = False - - # Reset environment - state = env.reset() - episode_timesteps = 0 - episode_reward = 0 - episode_num += 1 - episode_start = time.time() - - end_time = time.time() - elapsed_time = end_time - start_time - print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) \ No newline at 
end of file diff --git a/example/value_example.py b/example/value_example.py deleted file mode 100644 index ffe1ab64..00000000 --- a/example/value_example.py +++ /dev/null @@ -1,147 +0,0 @@ -from cares_reinforcement_learning.memory import * -from cares_reinforcement_learning.util import helpers as hlp, Record - -import numpy as np -import time -import gym -import logging -import random - -from random import randrange - -from timeit import default_timer as timer - -def evaluate_value_network(env, agent, args, record=None, total_steps=0): - - if record is not None: - frame = env.grab_frame() - record.start_video(total_steps+1, frame) - - number_eval_episodes = int(args["number_eval_episodes"]) - - state = env.reset() - - exploration_rate = args["exploration_min"] - - for eval_episode_counter in range(number_eval_episodes): - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - done = False - truncated = False - - while not done and not truncated: - episode_timesteps += 1 - - if random.random() < exploration_rate: - action = randrange(env.action_num) - else: - action = agent.select_action_from_policy(state) - - state, reward, done, truncated = env.step(action) - episode_reward += reward - - if eval_episode_counter == 0 and record is not None: - frame = env.grab_frame() - record.log_video(frame) - - if done or truncated: - if record is not None: - record.log_eval( - total_steps=total_steps+1, - episode=eval_episode_counter+1, - episode_reward=episode_reward, - display=True - ) - - # Reset environment - state = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - record.stop_video() - -def value_based_train(env, agent, memory, record, args): - start_time = time.time() - - max_steps_training = args["max_steps_training"] - exploration_min = args["exploration_min"] - exploration_decay = args["exploration_decay"] - number_steps_per_evaluation = args["number_steps_per_evaluation"] - - batch_size = args["batch_size"] - seed = args["seed"] - G = args["G"] - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - evaluate = False - - state = env.reset() - - exploration_rate = 1 - - episode_start = time.time() - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - exploration_rate *= exploration_decay - exploration_rate = max(exploration_min, exploration_rate) - - if random.random() < exploration_rate: - action = randrange(env.action_num) - else: - action = agent.select_action_from_policy(state) - - next_state, reward, done, truncated = env.step(action) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - state = next_state - episode_reward += reward - - if len(memory) > batch_size: - for _ in range(G): - experience = memory.sample(batch_size) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'] - )) - memory.update_priorities(experience['indices'], info) - # record.log_info(info, display=False) - - if (total_step_counter+1) % number_steps_per_evaluation == 0: - evaluate = True - - if done or truncated: - episode_time = time.time() - episode_start - record.log_train( - total_steps = total_step_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - episode_time = episode_time, - display = True - ) - - if evaluate: - logging.info("*************--Evaluation Loop--*************") - args["evaluation_seed"] = seed - 
evaluate_value_network(env, agent, args, record=record, total_steps=total_step_counter) - logging.info("--------------------------------------------") - evaluate = False - - # Reset environment - state = env.reset() - episode_timesteps = 0 - episode_reward = 0 - episode_num += 1 - episode_start = time.time() - - end_time = time.time() - elapsed_time = end_time - start_time - print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) \ No newline at end of file From d32e72312e31409f468eef1a3382b54a2ea44fc4 Mon Sep 17 00:00:00 2001 From: beardyface <henryamwilliams@gmail.com> Date: Fri, 13 Oct 2023 16:37:48 +1300 Subject: [PATCH 2/4] Updated Network Factory behaviour --- .../train_loops/policy_loop.py | 143 +++++++++++++++++ .../train_loops/ppo_loop.py | 130 ++++++++++++++++ .../train_loops/value_loop.py | 147 ++++++++++++++++++ .../util/NetworkFactory.py | 4 +- example/example_training_loops.py | 2 + 5 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 cares_reinforcement_learning/train_loops/policy_loop.py create mode 100644 cares_reinforcement_learning/train_loops/ppo_loop.py create mode 100644 cares_reinforcement_learning/train_loops/value_loop.py diff --git a/cares_reinforcement_learning/train_loops/policy_loop.py b/cares_reinforcement_learning/train_loops/policy_loop.py new file mode 100644 index 00000000..494bb2e1 --- /dev/null +++ b/cares_reinforcement_learning/train_loops/policy_loop.py @@ -0,0 +1,143 @@ +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.memory.augments import * +from cares_reinforcement_learning.util import helpers as hlp, Record + +import cv2 +import time +import gym +import logging +import numpy as np + +def evaluate_policy_network(env, agent, args, record=None, total_steps=0): + + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + + number_eval_episodes = int(args["number_eval_episodes"]) + + state = env.reset() + + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) + + state, reward, done, truncated = env.step(action_env) + episode_reward += reward + + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps+1, + episode=eval_episode_counter+1, + episode_reward=episode_reward, + display=True + ) + + # Reset environment + state = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + + record.stop_video() + +def policy_based_train(env, agent, memory, record, args): + start_time = time.time() + + max_steps_training = args["max_steps_training"] + max_steps_exploration = args["max_steps_exploration"] + number_steps_per_evaluation = args["number_steps_per_evaluation"] + + logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}") + + batch_size = args["batch_size"] + seed = args["seed"] + G = args["G"] + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + evaluate = False + + state = env.reset() + + episode_start = time.time() + for total_step_counter in range(int(max_steps_training)): + 
episode_timesteps += 1 + + if total_step_counter < max_steps_exploration: + logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}") + # action range the env uses [e.g. -2 , 2 for pendulum] + action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num) + # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai + action = hlp.normalize(action_env, env.max_action_value, env.min_action_value) + else: + # algorithm range [-1, 1] + action = agent.select_action_from_policy(state) + # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) + + next_state, reward, done, truncated = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if total_step_counter >= max_steps_exploration: + for i in range(G): + experience = memory.sample(batch_size) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'] + )) + memory.update_priorities(experience['indices'], info) + # record.log_info(info, display=False) + + if (total_step_counter+1) % number_steps_per_evaluation == 0: + evaluate = True + + if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps = total_step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + episode_time = episode_time, + display = True + ) + + if evaluate: + logging.info("*************--Evaluation Loop--*************") + args["evaluation_seed"] = seed + evaluate_policy_network(env, agent, args, record=record, total_steps=total_step_counter) + logging.info("--------------------------------------------") + evaluate = False + + # Reset environment + state = env.reset() + episode_timesteps = 0 + episode_reward = 0 + episode_num += 1 + episode_start = time.time() + + end_time = time.time() + elapsed_time = end_time - start_time + print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) \ No newline at end of file diff --git a/cares_reinforcement_learning/train_loops/ppo_loop.py b/cares_reinforcement_learning/train_loops/ppo_loop.py new file mode 100644 index 00000000..25671d0d --- /dev/null +++ b/cares_reinforcement_learning/train_loops/ppo_loop.py @@ -0,0 +1,130 @@ +from cares_reinforcement_learning.memory import * +from cares_reinforcement_learning.util import helpers as hlp +from cares_reinforcement_learning.util import Record + +import time +import gym +import logging + +from timeit import default_timer as timer + +def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): + + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + + number_eval_episodes = int(args["number_eval_episodes"]) + + state = env.reset() + + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) + + state, reward, done, truncated = env.step(action_env) + episode_reward += reward + + if eval_episode_counter 
== 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps+1, + episode=eval_episode_counter+1, + episode_reward=episode_reward, + display=True + ) + + # Reset environment + state = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + + record.stop_video() + +def ppo_train(env, agent, record, args): + start_time = time.time() + + seed = args["seed"] + max_steps_training = args["max_steps_training"] + max_steps_per_batch = args["max_steps_per_batch"] + number_steps_per_evaluation = args["number_steps_per_evaluation"] + + episode_timesteps = 0 + episode_num = 0 + episode_reward = 0 + + memory = MemoryBuffer() + + evaluate = False + + state = env.reset() + + episode_start = time.time() + for total_step_counter in range(int(max_steps_training)): + episode_timesteps += 1 + + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) + + next_state, reward, done, truncated = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) + + state = next_state + episode_reward += reward + + if (total_step_counter+1) % max_steps_per_batch == 0: + experience = memory.flush() + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'], + experience['log_prob'] + )) + # record.log_info(info, display=False) + + if (total_step_counter+1) % number_steps_per_evaluation == 0: + evaluate = True + + if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps = total_step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + episode_time = episode_time, + display = True + ) + + if evaluate: + logging.info("*************--Evaluation Loop--*************") + args["evaluation_seed"] = seed + evaluate_ppo_network(env, agent, args, record=record, total_steps=total_step_counter) + logging.info("--------------------------------------------") + evaluate = False + + # Reset environment + state = env.reset() + episode_timesteps = 0 + episode_reward = 0 + episode_num += 1 + episode_start = time.time() + + end_time = time.time() + elapsed_time = end_time - start_time + print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) \ No newline at end of file diff --git a/cares_reinforcement_learning/train_loops/value_loop.py b/cares_reinforcement_learning/train_loops/value_loop.py new file mode 100644 index 00000000..ffe1ab64 --- /dev/null +++ b/cares_reinforcement_learning/train_loops/value_loop.py @@ -0,0 +1,147 @@ +from cares_reinforcement_learning.memory import * +from cares_reinforcement_learning.util import helpers as hlp, Record + +import numpy as np +import time +import gym +import logging +import random + +from random import randrange + +from timeit import default_timer as timer + +def evaluate_value_network(env, agent, args, record=None, total_steps=0): + + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + + number_eval_episodes = int(args["number_eval_episodes"]) + + state = env.reset() + + exploration_rate = args["exploration_min"] + + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + 
truncated = False + + while not done and not truncated: + episode_timesteps += 1 + + if random.random() < exploration_rate: + action = randrange(env.action_num) + else: + action = agent.select_action_from_policy(state) + + state, reward, done, truncated = env.step(action) + episode_reward += reward + + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps+1, + episode=eval_episode_counter+1, + episode_reward=episode_reward, + display=True + ) + + # Reset environment + state = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + + record.stop_video() + +def value_based_train(env, agent, memory, record, args): + start_time = time.time() + + max_steps_training = args["max_steps_training"] + exploration_min = args["exploration_min"] + exploration_decay = args["exploration_decay"] + number_steps_per_evaluation = args["number_steps_per_evaluation"] + + batch_size = args["batch_size"] + seed = args["seed"] + G = args["G"] + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + evaluate = False + + state = env.reset() + + exploration_rate = 1 + + episode_start = time.time() + for total_step_counter in range(int(max_steps_training)): + episode_timesteps += 1 + + exploration_rate *= exploration_decay + exploration_rate = max(exploration_min, exploration_rate) + + if random.random() < exploration_rate: + action = randrange(env.action_num) + else: + action = agent.select_action_from_policy(state) + + next_state, reward, done, truncated = env.step(action) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + state = next_state + episode_reward += reward + + if len(memory) > batch_size: + for _ in range(G): + experience = memory.sample(batch_size) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'] + )) + memory.update_priorities(experience['indices'], info) + # record.log_info(info, display=False) + + if (total_step_counter+1) % number_steps_per_evaluation == 0: + evaluate = True + + if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps = total_step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + episode_time = episode_time, + display = True + ) + + if evaluate: + logging.info("*************--Evaluation Loop--*************") + args["evaluation_seed"] = seed + evaluate_value_network(env, agent, args, record=record, total_steps=total_step_counter) + logging.info("--------------------------------------------") + evaluate = False + + # Reset environment + state = env.reset() + episode_timesteps = 0 + episode_reward = 0 + episode_num += 1 + episode_start = time.time() + + end_time = time.time() + elapsed_time = end_time - start_time + print('Training time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) \ No newline at end of file diff --git a/cares_reinforcement_learning/util/NetworkFactory.py b/cares_reinforcement_learning/util/NetworkFactory.py index 91ea3a11..acf0c656 100644 --- a/cares_reinforcement_learning/util/NetworkFactory.py +++ b/cares_reinforcement_learning/util/NetworkFactory.py @@ -1,4 +1,5 @@ import torch +import logging def create_DQN(args): from cares_reinforcement_learning.algorithm.value import DQN @@ -159,4 +160,5 @@ def create_network(self, algorithm, 
args): return create_SAC(args) elif algorithm == "TD3": return create_TD3(args) - raise ValueError(f"Unkown algorithm: {algorithm}") + logging.warn(f"Algorithm: {algorithm} is not in the default cares_rl factory") + return None diff --git a/example/example_training_loops.py b/example/example_training_loops.py index 1ee36c7e..20c8d915 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -57,6 +57,8 @@ def main(): logging.info(f"Algorithm: {args['algorithm']}") agent = network_factory.create_network(args["algorithm"], args) + if agent == None: + raise ValueError(f"Unkown agent for default algorithms {args['algorithm']}") memory = memory_factory.create_memory(args['memory'], args) logging.info(f"Memory: {args['memory']}") From 6fde5a90bd65f999f6893d0ce71ef02965f76386 Mon Sep 17 00:00:00 2001 From: beardyface <henryamwilliams@gmail.com> Date: Mon, 16 Oct 2023 11:36:03 +1300 Subject: [PATCH 3/4] Adjusted args to be modular externally --- .../util/arguement_parser.py | 16 ++++++++-------- example/example_training_loops.py | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py index 8f889962..d968e401 100644 --- a/cares_reinforcement_learning/util/arguement_parser.py +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -73,9 +73,9 @@ def algorithm_args(parent_parser): parser_DoubleDQN.add_argument('--exploration_min', type=float, default=1e-3) parser_DoubleDQN.add_argument('--exploration_decay', type=float, default=0.95) - return alg_parser + return alg_parser, alg_parsers -def parse_args(): +def environment_parser(): parser = argparse.ArgumentParser(add_help=False) # Add an argument parser.add_argument('--number_training_iterations', type=int, default=1, help="Total amount of training iterations to complete") @@ -97,10 +97,10 @@ def parse_args(): parser.add_argument('--plot_frequency', type=int, default=100, help="How many steps between updating the running plot of the training and evaluation data during training") parser.add_argument('--checkpoint_frequency', type=int, default=100, help="How many steps between saving check point models of the agent during training") - parser = algorithm_args(parent_parser=parser) + return parser + +def create_parser(): + parser = environment_parser() + parser, alg_parsers = algorithm_args(parent_parser=parser) parser = environment_args(parent_parser=parser) - - return vars(parser.parse_args()) # converts to a dictionary - -if __name__ == '__main__': - print(parse_args()) + return parser \ No newline at end of file diff --git a/example/example_training_loops.py b/example/example_training_loops.py index 20c8d915..c8ac76f4 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -28,7 +28,8 @@ def set_seed(seed): random.seed(seed) def main(): - args = ap.parse_args() + parser = ap.create_parser() + args = vars(parser.parse_args()) # converts to a dictionary args["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu') logging.info(f"Device: {args['device']}") From be422cd8441ce3c45153e943015d588b168344cd Mon Sep 17 00:00:00 2001 From: beardyface <henryamwilliams@gmail.com> Date: Mon, 16 Oct 2023 11:47:21 +1300 Subject: [PATCH 4/4] Updated tests --- tests/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c8e83cc2..b4348a8b 100644 --- a/tests/test_utils.py +++ 
b/tests/test_utils.py @@ -72,9 +72,9 @@ def test_create_network(): agent = factory.create_network("TD3", args) assert isinstance(agent, TD3), "Failed to create TD3 agent" - - with pytest.raises(ValueError): - factory.create_network("Unknown", args) + + agent = factory.create_network("Unknown", args) + assert agent is None, f"Unknown failed to return None: returned {agent}" def test_denormalize():
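
Usage sketch (not part of the patches above): after this series, an external project can drive the library's train loops directly instead of copying the deleted example scripts. The snippet below mirrors what example_training_loops.py does after these changes; env, memory, record and the network_factory instance are taken as given parameters here, since their construction is not shown in these hunks, and the overall flow is a sketch rather than a definitive API.

import torch

import cares_reinforcement_learning.train_loops.policy_loop as pbe
from cares_reinforcement_learning.util import arguement_parser as ap


def run_external_training(env, memory, record, network_factory):
    # Patch 3: the parser is now built via create_parser() and the caller
    # converts the parsed namespace to a dict, as the example script does.
    parser = ap.create_parser()
    args = vars(parser.parse_args())
    args["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Patch 2: unknown algorithms no longer raise inside the factory; they log
    # a warning and return None, so external callers can supply a fallback or
    # raise their own error, as example_training_loops.py now does.
    agent = network_factory.create_network(args["algorithm"], args)
    if agent is None:
        raise ValueError(f"Unknown agent for default algorithms {args['algorithm']}")

    # Patches 1-2: the former example loops now live in the library package
    # and can be called directly.
    pbe.policy_based_train(env, agent, memory, record, args)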