From 3ff62e7296da557ebe47228ba6a1239f10b1cd64 Mon Sep 17 00:00:00 2001 From: beardyface Date: Tue, 10 Oct 2023 15:59:45 +1300 Subject: [PATCH 1/9] Incorporating in DMCS into the process --- .../util/EnvironmentFactory.py | 130 ++++++++++++++++++ cares_reinforcement_learning/util/Record.py | 2 +- cares_reinforcement_learning/util/__init__.py | 1 + example/example_training_loops.py | 48 ++++--- example/policy_example.py | 35 ++--- 5 files changed, 173 insertions(+), 43 deletions(-) create mode 100644 cares_reinforcement_learning/util/EnvironmentFactory.py diff --git a/cares_reinforcement_learning/util/EnvironmentFactory.py b/cares_reinforcement_learning/util/EnvironmentFactory.py new file mode 100644 index 00000000..8d78ae49 --- /dev/null +++ b/cares_reinforcement_learning/util/EnvironmentFactory.py @@ -0,0 +1,130 @@ +import logging + +import gym +from gym import spaces + +from dm_control import suite + +import numpy as np +from collections import deque + +# from typing import override +from functools import cached_property + +class EnvironmentFactory: + def __init__(self) -> None: + pass + + def create_environment(self, gym_environment, args): + logging.info(f"Training Environment: {gym_environment}") + if gym_environment == 'dmcs': + env = DMCSImageEnvironment(args=args) if args['image_observation'] else DMCS(args=args) + elif gym_environment == "gym": + env = OpenAIGym(args=args) + else: + raise ValueError(f"Unkown environment: {gym_environment}") + return env + +class OpenAIGym: + def __init__(self, args) -> None: + logging.info(f"Training task {args['task']}") + self.env = gym.make(args["task"], render_mode=(None if args['render'] == "None" else args['render'])) + self.env.action_space.seed(args['seed']) + + @cached_property + def max_action_value(self): + return self.env.action_space.high[0] + + @cached_property + def min_action_value(self): + return self.env.action_space.low[0] + + @cached_property + def observation_space(self): + return self.env.observation_space.shape[0] + + @cached_property + def action_num(self): + if type(self.env.action_space) == spaces.Box: + action_num = self.env.action_space.shape[0] + elif type(self.env.action_space) == spaces.Discrete: + action_num= self.env.action_space.n + else: + raise ValueError(f"Unhandled action space type: {type(self.env.action_space)}") + return action_num + + def reset(self): + state, _ = self.env.reset() + return state + + def step(self, action): + state, reward, done, truncated, _ = self.env.step(action) + return state, reward, done, truncated + +class DMCS: + def __init__(self, args) -> None: + logging.info(f"Training on Domain {args['domain']}") + logging.info(f"Training with Task {args['task']}") + + self.env = suite.load(args['domain'], args['task'], task_kwargs={'random': args['seed']}) + + @cached_property + def min_action_value(self): + return self.env.action_spec().minimum[0] + + @cached_property + def max_action_value(self): + return self.env.action_spec().maximum[0] + + @cached_property + def observation_space(self): + time_step = self.env.reset() + observation = np.hstack(list(time_step.observation.values())) # # e.g. position, orientation, joint_angles + return len(observation) + + @cached_property + def action_num(self): + return self.env.action_spec().shape[0] + + def reset(self): + time_step = self.env.reset() + observation = np.hstack(list(time_step.observation.values())) # # e.g. 
position, orientation, joint_angles + return observation + + def step(self, action): + time_step = self.env.step(action) + state, reward, done = np.hstack(list(time_step.observation.values())), time_step.reward, time_step.last() + return state, reward, done, False # for consistency with open ai gym just add false for truncated + +# TODO paramatise the observation size 3x84x84 +class DMCSImageEnvironment(DMCS): + def __init__(self, args, k=3): + self.k = k # number of frames to be stacked + self.frames_stacked = deque([], maxlen=k) + + super().__init__(args=args) + + # @override + @property + def observation_space(self): + return self.env.observation_space.shape[0] + + # @override + def reset(self): + _ = self.env.reset() + frame = self.env.physics.render(84, 84, camera_id=0) # --> shape= (84, 84, 3) + frame = np.moveaxis(frame, -1, 0) # --> shape= (3, 84, 84) + for _ in range(self.k): + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) # --> shape = (9, 84, 84) + return stacked_frames + + # @override + def step(self, action): + time_step = self.env.step(action) + reward, done = time_step.reward, time_step.last() + frame = self.env.physics.render(84, 84, camera_id=0) + frame = np.moveaxis(frame, -1, 0) + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) + return stacked_frames, reward, done, False # for consistency with open ai gym just add false for truncated \ No newline at end of file diff --git a/cares_reinforcement_learning/util/Record.py b/cares_reinforcement_learning/util/Record.py index 4b06447b..9a3fdb6e 100644 --- a/cares_reinforcement_learning/util/Record.py +++ b/cares_reinforcement_learning/util/Record.py @@ -70,7 +70,7 @@ def save_data(self, data_frame, filename, logs, display=True): string = '| ' + string + ' |' if display: - print(string) + logging.info(string) def save(self): logging.info(f"Saving final outputs") diff --git a/cares_reinforcement_learning/util/__init__.py b/cares_reinforcement_learning/util/__init__.py index 7af8c198..391658ee 100644 --- a/cares_reinforcement_learning/util/__init__.py +++ b/cares_reinforcement_learning/util/__init__.py @@ -1,3 +1,4 @@ from .NetworkFactory import NetworkFactory from .Record import Record +from .EnvironmentFactory import EnvironmentFactory diff --git a/example/example_training_loops.py b/example/example_training_loops.py index 506df6f4..52c297b9 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -1,10 +1,13 @@ import time import argparse +import logging +logging.basicConfig(level=logging.INFO) from cares_reinforcement_learning.util import NetworkFactory from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.memory.augments import * from cares_reinforcement_learning.util import Record +from cares_reinforcement_learning.util import EnvironmentFactory import example.policy_example as pbe import example.value_example as vbe @@ -15,24 +18,23 @@ import torch import random -import logging import numpy as np -logging.basicConfig(level=logging.INFO) - -def set_seed(env, seed): +def set_seed(seed): torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) - env.action_space.seed(seed) def parse_args(): parser = argparse.ArgumentParser() # Add an argument + parser.add_argument('--gym_environment', type=str, required=True) + parser.add_argument('--domain', type=str) parser.add_argument('--task', type=str, required=True) parser.add_argument('--render', 
type=str, default="None") parser.add_argument('--algorithm', type=str, required=True) parser.add_argument('--memory', type=str, default="MemoryBuffer") + parser.add_argument('--image_observation', type=bool, default=False) parser.add_argument('--G', type=int, default=10) parser.add_argument('--gamma', type=float, default=0.99) @@ -40,9 +42,9 @@ def parse_args(): parser.add_argument('--batch_size', type=int, default=32) parser.add_argument('--max_steps_exploration', type=int, default=10000) - parser.add_argument('--max_steps_training', type=int, default=50000) + parser.add_argument('--max_steps_training', type=int, default=100000) - parser.add_argument('--number_steps_per_evaluation', type=int, default=1000) + parser.add_argument('--number_steps_per_evaluation', type=int, default=10000) parser.add_argument('--number_eval_episodes', type=int, default=10) parser.add_argument('--seed', type=int, default=571) @@ -67,31 +69,27 @@ def parse_args(): def main(): args = parse_args() args["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + logging.info(f"Device: {args['device']}") logging.info(f"Training on {args['task']}") - env = gym.make(args["task"], render_mode=(None if args['render'] == "None" else args['render'])) - - logging.info(f"Device: {args['device']}") + env_factory = EnvironmentFactory() + + gym_environment = args['gym_environment'] + env = env_factory.create_environment(gym_environment=gym_environment, args=args) - args["observation_size"] = env.observation_space.shape[0] + args["observation_size"] = env.observation_space logging.info(f"Observation Size: {args['observation_size']}") - if type(env.action_space) == spaces.Box: - args["action_num"] = env.action_space.shape[0] - elif type(env.action_space) == spaces.Discrete: - args["action_num"] = env.action_space.n - else: - raise ValueError(f"Unhandled action space type: {type(env.action_space)}") + args['action_num'] = env.action_num logging.info(f"Action Num: {args['action_num']}") logging.info(f"Seed: {args['seed']}") - set_seed(env, args["seed"]) + set_seed(args["seed"]) # Create the network we are using factory = NetworkFactory() logging.info(f"Algorithm: {args['algorithm']}") agent = factory.create_network(args["algorithm"], args) - logging.info(f"Algorithm: {args['algorithm']}") # TODO move to memory factory as we add new PER if args["memory"] == "MemoryBuffer": @@ -110,16 +108,16 @@ def main(): # Train the policy or value based approach if args["algorithm"] == "PPO": ppe.ppo_train(env, agent, record, args) - env = gym.make(env.spec.id, render_mode="human") - ppe.evaluate_ppo_network(env, agent, args) + # env = gym.make(env.spec.id, render_mode="human") + # ppe.evaluate_ppo_network(env, agent, args) elif agent.type == "policy": pbe.policy_based_train(env, agent, memory, record, args) - env = gym.make(env.spec.id, render_mode="human") - pbe.evaluate_policy_network(env, agent, args) + # env = gym.make(env.spec.id, render_mode="human") + # pbe.evaluate_policy_network(env, agent, args) elif agent.type == "value": vbe.value_based_train(env, agent, memory, record, args) - env = gym.make(env.spec.id, render_mode="human") - vbe.evaluate_value_network(env, agent, args) + # env = gym.make(env.spec.id, render_mode="human") + # vbe.evaluate_value_network(env, agent, args) else: raise ValueError(f"Agent type is unkown: {agent.type}") diff --git a/example/policy_example.py b/example/policy_example.py index cdeab164..fd06f117 100644 --- a/example/policy_example.py +++ b/example/policy_example.py @@ -5,15 +5,13 @@ import 
time import gym import logging +import numpy as np def evaluate_policy_network(env, agent, args, record=None, total_steps=0): number_eval_episodes = int(args["number_eval_episodes"]) - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] - - state, _ = env.reset() + state = env.reset() for eval_episode_counter in range(number_eval_episodes): episode_timesteps = 0 @@ -25,9 +23,9 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): while not done and not truncated: episode_timesteps += 1 action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, max_action_value, min_action_value) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - state, reward, done, truncated, _ = env.step(action_env) + state, reward, done, truncated = env.step(action_env) episode_reward += reward if done or truncated: @@ -40,7 +38,7 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): ) # Reset environment - state, _ = env.reset() + state = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 @@ -52,20 +50,19 @@ def policy_based_train(env, agent, memory, record, args): max_steps_exploration = args["max_steps_exploration"] number_steps_per_evaluation = args["number_steps_per_evaluation"] + logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}") + batch_size = args["batch_size"] seed = args["seed"] G = args["G"] - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] - episode_timesteps = 0 episode_reward = 0 episode_num = 0 evaluate = False - state, _ = env.reset(seed=seed) + state = env.reset() episode_start = time.time() for total_step_counter in range(int(max_steps_training)): @@ -73,13 +70,17 @@ def policy_based_train(env, agent, memory, record, args): if total_step_counter < max_steps_exploration: logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}") - action_env = env.action_space.sample() # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, max_action_value, min_action_value) # algorithm range [-1, 1] + # action range the env uses [e.g. -2 , 2 for pendulum] + action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num) + # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai + action = hlp.normalize(action_env, env.max_action_value, env.min_action_value) else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, max_action_value, min_action_value) # mapping to env range [e.g. -2 , 2 for pendulum] + # algorithm range [-1, 1] + action = agent.select_action_from_policy(state) + # mapping to env range [e.g. 
-2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - next_state, reward, done, truncated, info = env.step(action_env) + next_state, reward, done, truncated = env.step(action_env) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) state = next_state @@ -120,7 +121,7 @@ def policy_based_train(env, agent, memory, record, args): evaluate = False # Reset environment - state, _ = env.reset() + state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 From a54ce3650d03e7f3698b9dbc753da96cd4a70ea6 Mon Sep 17 00:00:00 2001 From: beardyface Date: Wed, 11 Oct 2023 08:49:29 +1300 Subject: [PATCH 2/9] Added video recording of the evaluation loop --- .../util/EnvironmentFactory.py | 14 +++++++++++++- cares_reinforcement_learning/util/Record.py | 18 +++++++++++++++++- example/example_training_loops.py | 9 --------- example/policy_example.py | 11 +++++++++++ 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/cares_reinforcement_learning/util/EnvironmentFactory.py b/cares_reinforcement_learning/util/EnvironmentFactory.py index 8d78ae49..3ea3e4ff 100644 --- a/cares_reinforcement_learning/util/EnvironmentFactory.py +++ b/cares_reinforcement_learning/util/EnvironmentFactory.py @@ -1,5 +1,7 @@ import logging +import cv2 + import gym from gym import spaces @@ -28,7 +30,7 @@ def create_environment(self, gym_environment, args): class OpenAIGym: def __init__(self, args) -> None: logging.info(f"Training task {args['task']}") - self.env = gym.make(args["task"], render_mode=(None if args['render'] == "None" else args['render'])) + self.env = gym.make(args["task"], render_mode="rgb_array") self.env.action_space.seed(args['seed']) @cached_property @@ -60,6 +62,11 @@ def reset(self): def step(self, action): state, reward, done, truncated, _ = self.env.step(action) return state, reward, done, truncated + + def grab_frame(self): + frame = self.env.render() + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert to BGR for use with OpenCV + return frame class DMCS: def __init__(self, args) -> None: @@ -95,6 +102,11 @@ def step(self, action): time_step = self.env.step(action) state, reward, done = np.hstack(list(time_step.observation.values())), time_step.reward, time_step.last() return state, reward, done, False # for consistency with open ai gym just add false for truncated + + def grab_frame(self): + frame = self.env.physics.render(camera_id=0, height=240, width=300) + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert to BGR for use with OpenCV + return frame # TODO paramatise the observation size 3x84x84 class DMCSImageEnvironment(DMCS): diff --git a/cares_reinforcement_learning/util/Record.py b/cares_reinforcement_learning/util/Record.py index 9a3fdb6e..bd5060c1 100644 --- a/cares_reinforcement_learning/util/Record.py +++ b/cares_reinforcement_learning/util/Record.py @@ -1,5 +1,6 @@ import os import logging +import cv2 import pandas as pd @@ -36,6 +37,18 @@ def __init__(self, glob_log_dir=None, log_dir=None, network=None, config=None) - with open(f'{self.directory}/config.yml', 'w') as outfile: yaml.dump(config, outfile, default_flow_style=False) + def start_video(self, file_name, frame): + fps = 30 + video_name = f"{self.directory}/videos/{file_name}.mp4" + height, width, channels = frame.shape + self.video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)) + + def stop_video(self): + 
self.video.release() + + def log_video(self, frame): + self.video.write(frame) + def log_info(self, info, display=False): self.info_data = pd.concat([self.info_data, pd.DataFrame([info])], ignore_index=True) self.save_data(self.info_data, "info", info, display=display) @@ -97,4 +110,7 @@ def __initialise_directories(self): os.mkdir(f'{self.directory}/models') if not os.path.exists(f'{self.directory}/figures'): - os.mkdir(f'{self.directory}/figures') + os.mkdir(f'{self.directory}/figures') + + if not os.path.exists(f'{self.directory}/videos'): + os.mkdir(f'{self.directory}/videos') diff --git a/example/example_training_loops.py b/example/example_training_loops.py index 52c297b9..e861c0a0 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -31,7 +31,6 @@ def parse_args(): parser.add_argument('--gym_environment', type=str, required=True) parser.add_argument('--domain', type=str) parser.add_argument('--task', type=str, required=True) - parser.add_argument('--render', type=str, default="None") parser.add_argument('--algorithm', type=str, required=True) parser.add_argument('--memory', type=str, default="MemoryBuffer") parser.add_argument('--image_observation', type=bool, default=False) @@ -62,8 +61,6 @@ def parse_args(): parser.add_argument('--plot_frequency', type=int, default=100) parser.add_argument('--checkpoint_frequency', type=int, default=100) - parser.add_argument('--display', type=str, default=True) - return vars(parser.parse_args()) # converts into a dictionary def main(): @@ -108,16 +105,10 @@ def main(): # Train the policy or value based approach if args["algorithm"] == "PPO": ppe.ppo_train(env, agent, record, args) - # env = gym.make(env.spec.id, render_mode="human") - # ppe.evaluate_ppo_network(env, agent, args) elif agent.type == "policy": pbe.policy_based_train(env, agent, memory, record, args) - # env = gym.make(env.spec.id, render_mode="human") - # pbe.evaluate_policy_network(env, agent, args) elif agent.type == "value": vbe.value_based_train(env, agent, memory, record, args) - # env = gym.make(env.spec.id, render_mode="human") - # vbe.evaluate_value_network(env, agent, args) else: raise ValueError(f"Agent type is unkown: {agent.type}") diff --git a/example/policy_example.py b/example/policy_example.py index fd06f117..34a77f57 100644 --- a/example/policy_example.py +++ b/example/policy_example.py @@ -2,6 +2,7 @@ from cares_reinforcement_learning.memory.augments import * from cares_reinforcement_learning.util import helpers as hlp, Record +import cv2 import time import gym import logging @@ -9,6 +10,10 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + number_eval_episodes = int(args["number_eval_episodes"]) state = env.reset() @@ -28,6 +33,10 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): state, reward, done, truncated = env.step(action_env) episode_reward += reward + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + if done or truncated: if record is not None: record.log_eval( @@ -42,6 +51,8 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): episode_reward = 0 episode_timesteps = 0 episode_num += 1 + + record.stop_video() def policy_based_train(env, agent, memory, record, args): start_time = time.time() From 746ad59310daa115012636c51dd00bf4aa541ba7 Mon Sep 17 00:00:00 2001 From: beardyface Date: Wed, 11 
Oct 2023 10:31:29 +1300 Subject: [PATCH 3/9] Updated README --- README.md | 44 ++++++++++++++++++++++++++++++++++++++------ requirements.txt | 6 +----- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 3054bee3..1784c5ce 100644 --- a/README.md +++ b/README.md @@ -8,24 +8,56 @@ The CARES reinforcement learning bed used as the foundation for RL related proje Consult the repository [wiki](https://github.com/UoA-CARES/cares_reinforcement_learning/wiki) for a guide on how to use the package ## Installation Instructions -`git clone` the repository +If you want to utilise the GPU with Pytorch install CUDA first - https://developer.nvidia.com/cuda-toolkit -If you would like to leverage your machine's GPU, uncomment the optional dependencies in the `requirements.txt` before moving on. +Install Pytorch following the instructions here - https://pytorch.org/get-started/locally/ + +`git clone` the repository into your desired directory on your local machine Run `pip3 install -r requirements.txt` in the **root directory** of the package To make the module **globally accessible** in your working environment run `pip3 install --editable .` in the **project root** ## Running an Example -This repository includes a script that allows you to run any OpenAI environment – provided you comply with all the dependencies for that environment. These examples make use of the package, and can provide an example on how one might use the package in their own environments. +This repository includes a script that allows you to run any OpenAI Gymnasium (https://github.com/Farama-Foundation/Gymnasium) or Deep Mind Control Suite (https://github.com/google-deepmind/dm_control) environment – provided you comply with all the dependencies for that environment. These examples make use of the package, and can provide an example on how one might use the package in their own environments. + +`example_training_loops.py` takes in hyperparameters that allow you to customise the training run enviromment – OpenAI or DMCS Environment - or RL algorithm. Use `python3 example_training_loops.py -h` for help on what hyperparameters are available for customisation. + +An example is found below for running on the OpenAI and DMCS environments with TD3: +``` +python3 example_training_loops.py --gym_environment gym --task HalfCheetah-v4 --algorithm TD3 + -`example_training_loops.py` takes in hyperparameters that allow you to customise the training run – OpenAI Environment, training steps, gamma... Use `python3 example_training_loops.py -h` for help on what hyperparameters are available for customisation. +python3 example_training_loops.py --gym_environment dmcs --domain ball_in_cup --task catch --algorithm TD3 +``` + +### Data Outputs +All data from a training run is saved into '~/cares_rl_logs'. A folder will be created for each training run named as 'ALGORITHM-TASK-YY_MM_DD:HH:MM:SS', e.g. 'TD3-HalfCheetah-v4-23_10_11_08:47:22'. This folder will contain the following directories and information saved during the training session: -An example is found below: ``` -python3 example_training_loops.py --task 'Pendulum-v1' --algorithm PPO --max_steps_training 1000000 --seed 571 --gamma 0.99 --actor_lr 0.0001 --critic_lr 0.001 +ALGORITHM-TASK-YY_MM_DD:HH:MM:SS/ +├─ config.py +├─ data +| ├─ train.csv +| ├─ eval.csv +├─ figures +| ├─ eval.png +| ├─ train.png +├─ models +| ├─ model.pht +| ├─ CHECKPOINT_N.pht +| ├─ ... +├─ videos +| ├─ STEP.mp4 +| ├─ ... 
``` +### Plotting +The plotting utility will plot the data contained in the training data. An example of how to plot the data from one or multiple training sessions together is shown below. Running 'python3 plotter.py -h' will provide details on the plotting parameters. + +``` +python3 plotter.py -s ~/cares_rl_logs -d ~/cares_rl_logs/ALGORITHM-TASK-YY_MM_DD:HH:MM:SS -w 20 +``` ## Package Structure diff --git a/requirements.txt b/requirements.txt index a2a50f8b..56d00fc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,8 +27,4 @@ torchvision==0.14.1 typing_extensions==4.4.0 urllib3==1.26.13 PyYAML==6.0 -# Optional Dependencies if you want to leverage the GPU -# nvidia-cublas-cu11==11.10.3.66 -# nvidia-cuda-nvrtc-cu11==11.7.99 -# nvidia-cuda-runtime-cu11==11.7.99 -# nvidia-cudnn-cu11==8.5.0.96 +dm_control==1.0.10 From 68a54d4809b291bed938e346386fac091cba60cf Mon Sep 17 00:00:00 2001 From: beardyface Date: Wed, 11 Oct 2023 13:52:03 +1300 Subject: [PATCH 4/9] Updated Argparse --- README.md | 4 ++-- .../util/EnvironmentFactory.py | 2 +- example/example_training_loops.py | 23 +++++++++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1784c5ce..2a057527 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,10 @@ This repository includes a script that allows you to run any OpenAI Gymnasium (h An example is found below for running on the OpenAI and DMCS environments with TD3: ``` -python3 example_training_loops.py --gym_environment gym --task HalfCheetah-v4 --algorithm TD3 +python3 example_training_loops.py openai --task HalfCheetah-v4 --algorithm TD3 -python3 example_training_loops.py --gym_environment dmcs --domain ball_in_cup --task catch --algorithm TD3 +python3 example_training_loops.py dmcs --domain ball_in_cup --task catch --algorithm TD3 ``` ### Data Outputs diff --git a/cares_reinforcement_learning/util/EnvironmentFactory.py b/cares_reinforcement_learning/util/EnvironmentFactory.py index 3ea3e4ff..ce3b688c 100644 --- a/cares_reinforcement_learning/util/EnvironmentFactory.py +++ b/cares_reinforcement_learning/util/EnvironmentFactory.py @@ -21,7 +21,7 @@ def create_environment(self, gym_environment, args): logging.info(f"Training Environment: {gym_environment}") if gym_environment == 'dmcs': env = DMCSImageEnvironment(args=args) if args['image_observation'] else DMCS(args=args) - elif gym_environment == "gym": + elif gym_environment == "openai": env = OpenAIGym(args=args) else: raise ValueError(f"Unkown environment: {gym_environment}") diff --git a/example/example_training_loops.py b/example/example_training_loops.py index e861c0a0..b8af0b34 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -25,12 +25,23 @@ def set_seed(seed): np.random.seed(seed) random.seed(seed) -def parse_args(): - parser = argparse.ArgumentParser() # Add an argument +def environment_args(parent_parser): + env_parser = argparse.ArgumentParser() + env_parsers = env_parser.add_subparsers(help='sub-command help', dest='gym_environment', required=True) + + # create the parser for the DMCS sub-command + parser_dmcs = env_parsers.add_parser('dmcs', help='DMCS', parents=[parent_parser]) + parser_dmcs.add_argument('--domain', type=str, required=True) + parser_dmcs.add_argument('--task', type=str, required=True) + + # create the parser for the OpenAI sub-command + parser_openai = env_parsers.add_parser('openai', help='openai', parents=[parent_parser]) + parser_openai.add_argument('--task', type=str, required=True) + return 
env_parser - parser.add_argument('--gym_environment', type=str, required=True) - parser.add_argument('--domain', type=str) - parser.add_argument('--task', type=str, required=True) +def parse_args(): + parser = argparse.ArgumentParser(add_help=False) # Add an argument + parser.add_argument('--algorithm', type=str, required=True) parser.add_argument('--memory', type=str, default="MemoryBuffer") parser.add_argument('--image_observation', type=bool, default=False) @@ -61,6 +72,8 @@ def parse_args(): parser.add_argument('--plot_frequency', type=int, default=100) parser.add_argument('--checkpoint_frequency', type=int, default=100) + parser = environment_args(parent_parser=parser) # NOTE this has to go after the rest of parser is created + return vars(parser.parse_args()) # converts into a dictionary def main(): From bf270f0e8370d101ff8934decb9e2edb00a51906 Mon Sep 17 00:00:00 2001 From: beardyface Date: Wed, 11 Oct 2023 14:49:35 +1300 Subject: [PATCH 5/9] Updated arguement passing --- .../util/arguement_parser.py | 100 ++++++++++++++++++ example/example_training_loops.py | 55 +--------- 2 files changed, 103 insertions(+), 52 deletions(-) create mode 100644 cares_reinforcement_learning/util/arguement_parser.py diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py new file mode 100644 index 00000000..9e2b4a68 --- /dev/null +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -0,0 +1,100 @@ +""" +Example of using sub-parser, sub-commands and sub-sub-commands :-) +""" + +import argparse + +def environment_args(parent_parser): + env_parser = argparse.ArgumentParser() + env_parsers = env_parser.add_subparsers(help='sub-command help', dest='gym_environment', required=True) + + # create the parser for the DMCS sub-command + parser_dmcs = env_parsers.add_parser('dmcs', help='DMCS', parents=[parent_parser]) + required = parser_dmcs.add_argument_group('required arguments') + required.add_argument('--domain', type=str, required=True) + required.add_argument('--task', type=str, required=True) + + # create the parser for the OpenAI sub-command + parser_openai = env_parsers.add_parser('openai', help='openai', parents=[parent_parser]) + required = parser_openai.add_argument_group('required arguments') + required.add_argument('--task', type=str, required=True) + return env_parser + +def algorithm_args(parent_parser): + alg_parser = argparse.ArgumentParser(add_help=False) + alg_parsers = alg_parser.add_subparsers(help='sub-command help', dest='algorithm', required=True) + + # create the parser for TD3 with default parameters + parser_TD3 = alg_parsers.add_parser('TD3', help='TD3', parents=[parent_parser]) + parser_TD3.add_argument('--actor_lr', type=float, default=1e-4) + parser_TD3.add_argument('--critic_lr', type=float, default=1e-3) + parser_TD3.add_argument('--gamma', type=float, default=0.99) + parser_TD3.add_argument('--tau', type=float, default=0.005) + + # create the parser for DDPG with default parameters + parser_DDPG = alg_parsers.add_parser('DDPG', help='DDPG', parents=[parent_parser]) + parser_DDPG.add_argument('--actor_lr', type=float, default=1e-4) + parser_DDPG.add_argument('--critic_lr', type=float, default=1e-3) + parser_DDPG.add_argument('--gamma', type=float, default=0.99) + parser_DDPG.add_argument('--tau', type=float, default=0.005) + + # create the parser for SAC with default parameters + parser_SAC = alg_parsers.add_parser('SAC', help='SAC', parents=[parent_parser]) + parser_SAC.add_argument('--actor_lr', 
type=float, default=1e-4) + parser_SAC.add_argument('--critic_lr', type=float, default=1e-3) + parser_SAC.add_argument('--gamma', type=float, default=0.99) + parser_SAC.add_argument('--tau', type=float, default=0.005) + + # create the parser for PPO with default parameters + parser_PPO = alg_parsers.add_parser('PPO', help='SAC', parents=[parent_parser]) + parser_PPO.add_argument('--actor_lr', type=float, default=1e-4) + parser_PPO.add_argument('--critic_lr', type=float, required=1e-3) + parser_PPO.add_argument('--gamma', type=float, required=0.99) + + # create the parser for DQN with default parameters + parser_DQN = alg_parsers.add_parser('DQN', help='DQN', parents=[parent_parser]) + parser_DQN.add_argument('--lr', type=float, default=1e-3) + parser_DQN.add_argument('--gamma', type=float, required=0.99) + + # create the parser for DuelingDQN with default parameters + parser_DuelingDQN = alg_parsers.add_parser('DuelingDQN', help='DuelingDQN', parents=[parent_parser]) + parser_DuelingDQN.add_argument('--lr', type=float, default=1e-3) + parser_DuelingDQN.add_argument('--gamma', type=float, required=0.99) + + # create the parser for DoubleDQN with default parameters + parser_DoubleDQN = alg_parsers.add_parser('DoubleDQN', help='DoubleDQN', parents=[parent_parser]) + parser_DoubleDQN.add_argument('--lr', type=float, default=1e-3) + parser_DoubleDQN.add_argument('--gamma', type=float, required=0.99) + + return alg_parser + +def parse_args(): + parser = argparse.ArgumentParser(add_help=False) # Add an argument + + parser.add_argument('--memory', type=str, default="MemoryBuffer") + parser.add_argument('--image_observation', type=bool, default=False) + + parser.add_argument('--G', type=int, default=10) + parser.add_argument('--batch_size', type=int, default=32) + + parser.add_argument('--max_steps_exploration', type=int, default=10000) + parser.add_argument('--max_steps_training', type=int, default=100000) + + parser.add_argument('--number_steps_per_evaluation', type=int, default=10000) + parser.add_argument('--number_eval_episodes', type=int, default=10) + + parser.add_argument('--seed', type=int, default=571) + parser.add_argument('--evaluation_seed', type=int, default=152) + + parser.add_argument('--max_steps_per_batch', type=float, default=5000) + + parser.add_argument('--plot_frequency', type=int, default=100) + parser.add_argument('--checkpoint_frequency', type=int, default=100) + + parser = algorithm_args(parent_parser=parser) + parser = environment_args(parent_parser=parser) + + return vars(parser.parse_args()) # converts to a dictionary + +if __name__ == '__main__': + parse_args() diff --git a/example/example_training_loops.py b/example/example_training_loops.py index b8af0b34..b77ad72c 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -8,6 +8,7 @@ from cares_reinforcement_learning.memory.augments import * from cares_reinforcement_learning.util import Record from cares_reinforcement_learning.util import EnvironmentFactory +from cares_reinforcement_learning.util import arguement_parser as ap import example.policy_example as pbe import example.value_example as vbe @@ -25,59 +26,9 @@ def set_seed(seed): np.random.seed(seed) random.seed(seed) -def environment_args(parent_parser): - env_parser = argparse.ArgumentParser() - env_parsers = env_parser.add_subparsers(help='sub-command help', dest='gym_environment', required=True) - - # create the parser for the DMCS sub-command - parser_dmcs = env_parsers.add_parser('dmcs', help='DMCS', 
parents=[parent_parser]) - parser_dmcs.add_argument('--domain', type=str, required=True) - parser_dmcs.add_argument('--task', type=str, required=True) - - # create the parser for the OpenAI sub-command - parser_openai = env_parsers.add_parser('openai', help='openai', parents=[parent_parser]) - parser_openai.add_argument('--task', type=str, required=True) - return env_parser - -def parse_args(): - parser = argparse.ArgumentParser(add_help=False) # Add an argument - - parser.add_argument('--algorithm', type=str, required=True) - parser.add_argument('--memory', type=str, default="MemoryBuffer") - parser.add_argument('--image_observation', type=bool, default=False) - - parser.add_argument('--G', type=int, default=10) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--batch_size', type=int, default=32) - - parser.add_argument('--max_steps_exploration', type=int, default=10000) - parser.add_argument('--max_steps_training', type=int, default=100000) - - parser.add_argument('--number_steps_per_evaluation', type=int, default=10000) - parser.add_argument('--number_eval_episodes', type=int, default=10) - - parser.add_argument('--seed', type=int, default=571) - parser.add_argument('--evaluation_seed', type=int, default=152) - - parser.add_argument('--actor_lr', type=float, default=1e-4) - parser.add_argument('--critic_lr', type=float, default=1e-3) - - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--exploration_min', type=float, default=1e-3) - parser.add_argument('--exploration_decay', type=float, default=0.95) - - parser.add_argument('--max_steps_per_batch', type=float, default=5000) - - parser.add_argument('--plot_frequency', type=int, default=100) - parser.add_argument('--checkpoint_frequency', type=int, default=100) - - parser = environment_args(parent_parser=parser) # NOTE this has to go after the rest of parser is created - - return vars(parser.parse_args()) # converts into a dictionary - def main(): - args = parse_args() + args = ap.parse_args() + args["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu') logging.info(f"Device: {args['device']}") From 6330355f9757d6a31dd1d5668519c5a6763f22fc Mon Sep 17 00:00:00 2001 From: beardyface Date: Wed, 11 Oct 2023 15:07:52 +1300 Subject: [PATCH 6/9] Updated arguements and readme --- README.md | 6 +-- .../util/arguement_parser.py | 47 +++++++++---------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 2a057527..c9e602d6 100644 --- a/README.md +++ b/README.md @@ -21,14 +21,14 @@ To make the module **globally accessible** in your working environment run `pip3 ## Running an Example This repository includes a script that allows you to run any OpenAI Gymnasium (https://github.com/Farama-Foundation/Gymnasium) or Deep Mind Control Suite (https://github.com/google-deepmind/dm_control) environment – provided you comply with all the dependencies for that environment. These examples make use of the package, and can provide an example on how one might use the package in their own environments. -`example_training_loops.py` takes in hyperparameters that allow you to customise the training run enviromment – OpenAI or DMCS Environment - or RL algorithm. Use `python3 example_training_loops.py -h` for help on what hyperparameters are available for customisation. 
+`example_training_loops.py` takes in hyperparameters that allow you to customise the training run enviromment – OpenAI or DMCS Environment - or RL algorithm. Use `python3 example_training_loops.py -h` for help on what parameters are available for customisation. An example is found below for running on the OpenAI and DMCS environments with TD3: ``` -python3 example_training_loops.py openai --task HalfCheetah-v4 --algorithm TD3 +python3 example_training_loops.py openai --task HalfCheetah-v4 TD3 -python3 example_training_loops.py dmcs --domain ball_in_cup --task catch --algorithm TD3 +python3 example_training_loops.py dmcs --domain ball_in_cup --task catch TD3 ``` ### Data Outputs diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py index 9e2b4a68..73b0b766 100644 --- a/cares_reinforcement_learning/util/arguement_parser.py +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -6,23 +6,23 @@ def environment_args(parent_parser): env_parser = argparse.ArgumentParser() - env_parsers = env_parser.add_subparsers(help='sub-command help', dest='gym_environment', required=True) + env_parsers = env_parser.add_subparsers(title="Environment", description="OpenAI Gym or Deep Mind Control Suite", help='choose', dest='gym_environment', required=True) # create the parser for the DMCS sub-command - parser_dmcs = env_parsers.add_parser('dmcs', help='DMCS', parents=[parent_parser]) + parser_dmcs = env_parsers.add_parser('dmcs', help='Deep Mind Control Suite', parents=[parent_parser]) required = parser_dmcs.add_argument_group('required arguments') required.add_argument('--domain', type=str, required=True) required.add_argument('--task', type=str, required=True) # create the parser for the OpenAI sub-command - parser_openai = env_parsers.add_parser('openai', help='openai', parents=[parent_parser]) + parser_openai = env_parsers.add_parser('openai', help='OpenAI Gymnasium', parents=[parent_parser]) required = parser_openai.add_argument_group('required arguments') required.add_argument('--task', type=str, required=True) return env_parser def algorithm_args(parent_parser): alg_parser = argparse.ArgumentParser(add_help=False) - alg_parsers = alg_parser.add_subparsers(help='sub-command help', dest='algorithm', required=True) + alg_parsers = alg_parser.add_subparsers(help='Select which RL algorith you want to use', dest='algorithm', required=True) # create the parser for TD3 with default parameters parser_TD3 = alg_parsers.add_parser('TD3', help='TD3', parents=[parent_parser]) @@ -48,48 +48,47 @@ def algorithm_args(parent_parser): # create the parser for PPO with default parameters parser_PPO = alg_parsers.add_parser('PPO', help='SAC', parents=[parent_parser]) parser_PPO.add_argument('--actor_lr', type=float, default=1e-4) - parser_PPO.add_argument('--critic_lr', type=float, required=1e-3) - parser_PPO.add_argument('--gamma', type=float, required=0.99) + parser_PPO.add_argument('--critic_lr', type=float, default=1e-3) + parser_PPO.add_argument('--gamma', type=float, default=0.99) + parser_PPO.add_argument('--max_steps_per_batch', type=float, default=5000) # create the parser for DQN with default parameters parser_DQN = alg_parsers.add_parser('DQN', help='DQN', parents=[parent_parser]) parser_DQN.add_argument('--lr', type=float, default=1e-3) - parser_DQN.add_argument('--gamma', type=float, required=0.99) + parser_DQN.add_argument('--gamma', type=float, default=0.99) # create the parser for DuelingDQN with default parameters parser_DuelingDQN 
= alg_parsers.add_parser('DuelingDQN', help='DuelingDQN', parents=[parent_parser]) parser_DuelingDQN.add_argument('--lr', type=float, default=1e-3) - parser_DuelingDQN.add_argument('--gamma', type=float, required=0.99) + parser_DuelingDQN.add_argument('--gamma', type=float, default=0.99) # create the parser for DoubleDQN with default parameters parser_DoubleDQN = alg_parsers.add_parser('DoubleDQN', help='DoubleDQN', parents=[parent_parser]) parser_DoubleDQN.add_argument('--lr', type=float, default=1e-3) - parser_DoubleDQN.add_argument('--gamma', type=float, required=0.99) + parser_DoubleDQN.add_argument('--gamma', type=float, default=0.99) return alg_parser def parse_args(): parser = argparse.ArgumentParser(add_help=False) # Add an argument - parser.add_argument('--memory', type=str, default="MemoryBuffer") - parser.add_argument('--image_observation', type=bool, default=False) + # TODO + parser.add_argument('--memory', type=str, default="MemoryBuffer", help="Memory type - options: {MemoryBuffer, PER}") + parser.add_argument('--image_observation', type=bool, default=False, help="Use image as the observation state from the environment") - parser.add_argument('--G', type=int, default=10) - parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--G', type=int, default=10, help="Number of learning updates each step of training") + parser.add_argument('--batch_size', type=int, default=32, help="Batch Size used during training") - parser.add_argument('--max_steps_exploration', type=int, default=10000) - parser.add_argument('--max_steps_training', type=int, default=100000) + parser.add_argument('--max_steps_exploration', type=int, default=10000, help="Total number of steps for exploration before training") + parser.add_argument('--max_steps_training', type=int, default=100000, help="Total number of steps to train the algorithm") - parser.add_argument('--number_steps_per_evaluation', type=int, default=10000) - parser.add_argument('--number_eval_episodes', type=int, default=10) + parser.add_argument('--number_steps_per_evaluation', type=int, default=10000, help="The number of steps inbetween evaluation runs during training") + parser.add_argument('--number_eval_episodes', type=int, default=10, help="The number of episodes to evaluate the agent on during training") - parser.add_argument('--seed', type=int, default=571) - parser.add_argument('--evaluation_seed', type=int, default=152) + parser.add_argument('--seed', type=int, default=571, help="The random seed to set for training") - parser.add_argument('--max_steps_per_batch', type=float, default=5000) - - parser.add_argument('--plot_frequency', type=int, default=100) - parser.add_argument('--checkpoint_frequency', type=int, default=100) + parser.add_argument('--plot_frequency', type=int, default=100, help="How many steps between updating the running plot of the training and evaluation data during training") + parser.add_argument('--checkpoint_frequency', type=int, default=100, help="How many steps between saving check point models of the agent during training") parser = algorithm_args(parent_parser=parser) parser = environment_args(parent_parser=parser) @@ -97,4 +96,4 @@ def parse_args(): return vars(parser.parse_args()) # converts to a dictionary if __name__ == '__main__': - parse_args() + print(parse_args()) From d29d19fd9e703289c3b32d55afdd45f654270365 Mon Sep 17 00:00:00 2001 From: beardyface Date: Wed, 11 Oct 2023 16:54:50 +1300 Subject: [PATCH 7/9] Updated Value and PPO training loops --- 
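Notes: this patch brings value_example.py and ppo_example.py in line with the environment-wrapper interface and the evaluation video capture already used in policy_example.py. For orientation, the shared evaluation shape is roughly the condensed sketch below: the `evaluate` helper name is illustrative only; `grab_frame`, `start_video`, `log_video`, `stop_video`, `reset` and `step` are the methods defined on the wrappers and Record earlier in this series, while action (de)normalisation, the epsilon-greedy / log-prob differences between agents and the `record.log_eval` bookkeeping are omitted.

```python
# Condensed sketch of the shared evaluation/video pattern (illustrative only).
def evaluate(env, agent, record, number_eval_episodes, total_steps):
    if record is not None:
        record.start_video(total_steps + 1, env.grab_frame())

    for eval_episode_counter in range(number_eval_episodes):
        state, done, truncated = env.reset(), False, False
        while not done and not truncated:
            # agent call differs per algorithm (deterministic policy, epsilon-greedy DQN,
            # or PPO returning (action, log_prob)) - simplified here
            action = agent.select_action_from_policy(state, evaluation=True)
            state, _, done, truncated = env.step(action)
            if eval_episode_counter == 0 and record is not None:
                record.log_video(env.grab_frame())  # only the first episode is filmed

    if record is not None:
        record.stop_video()
```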
.../util/EnvironmentFactory.py | 38 +++++++++++++++++-- .../util/arguement_parser.py | 7 +++- example/policy_example.py | 2 +- example/ppo_example.py | 32 +++++++++------- example/value_example.py | 28 ++++++++++---- 5 files changed, 80 insertions(+), 27 deletions(-) diff --git a/cares_reinforcement_learning/util/EnvironmentFactory.py b/cares_reinforcement_learning/util/EnvironmentFactory.py index ce3b688c..fe259661 100644 --- a/cares_reinforcement_learning/util/EnvironmentFactory.py +++ b/cares_reinforcement_learning/util/EnvironmentFactory.py @@ -20,7 +20,7 @@ def __init__(self) -> None: def create_environment(self, gym_environment, args): logging.info(f"Training Environment: {gym_environment}") if gym_environment == 'dmcs': - env = DMCSImageEnvironment(args=args) if args['image_observation'] else DMCS(args=args) + env = DMCSImage(args=args) if args['image_observation'] else DMCS(args=args) elif gym_environment == "openai": env = OpenAIGym(args=args) else: @@ -67,6 +67,38 @@ def grab_frame(self): frame = self.env.render() frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert to BGR for use with OpenCV return frame + +class OpenAIGymImage: + def __init__(self, args, k=3): + self.k = k # number of frames to be stacked + self.frames_stacked = deque([], maxlen=k) + + super().__init__(args=args) + + # @override + @property + def observation_space(self): + raise NotImplementedError("Not Implemented Yet") + + # @override + def reset(self): + _ = self.env.reset() + frame = self.env.physics.render(84, 84, camera_id=0) # --> shape= (84, 84, 3) + frame = np.moveaxis(frame, -1, 0) # --> shape= (3, 84, 84) + for _ in range(self.k): + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) # --> shape = (9, 84, 84) + return stacked_frames + + # @override + def step(self, action): + time_step = self.env.step(action) + reward, done = time_step.reward, time_step.last() + frame = self.env.physics.render(84, 84, camera_id=0) + frame = np.moveaxis(frame, -1, 0) + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) + return stacked_frames, reward, done, False # for consistency with open ai gym just add false for truncated class DMCS: def __init__(self, args) -> None: @@ -109,7 +141,7 @@ def grab_frame(self): return frame # TODO paramatise the observation size 3x84x84 -class DMCSImageEnvironment(DMCS): +class DMCSImage(DMCS): def __init__(self, args, k=3): self.k = k # number of frames to be stacked self.frames_stacked = deque([], maxlen=k) @@ -119,7 +151,7 @@ def __init__(self, args, k=3): # @override @property def observation_space(self): - return self.env.observation_space.shape[0] + raise NotImplementedError("Not Implemented Yet") # @override def reset(self): diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py index 73b0b766..65652cde 100644 --- a/cares_reinforcement_learning/util/arguement_parser.py +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -56,23 +56,28 @@ def algorithm_args(parent_parser): parser_DQN = alg_parsers.add_parser('DQN', help='DQN', parents=[parent_parser]) parser_DQN.add_argument('--lr', type=float, default=1e-3) parser_DQN.add_argument('--gamma', type=float, default=0.99) + parser_DQN.add_argument('--exploration_min', type=float, default=1e-3) + parser_DQN.add_argument('--exploration_decay', type=float, default=0.95) # create the parser for DuelingDQN with default parameters parser_DuelingDQN = 
alg_parsers.add_parser('DuelingDQN', help='DuelingDQN', parents=[parent_parser]) parser_DuelingDQN.add_argument('--lr', type=float, default=1e-3) parser_DuelingDQN.add_argument('--gamma', type=float, default=0.99) + parser_DuelingDQN.add_argument('--exploration_min', type=float, default=1e-3) + parser_DuelingDQN.add_argument('--exploration_decay', type=float, default=0.95) # create the parser for DoubleDQN with default parameters parser_DoubleDQN = alg_parsers.add_parser('DoubleDQN', help='DoubleDQN', parents=[parent_parser]) parser_DoubleDQN.add_argument('--lr', type=float, default=1e-3) parser_DoubleDQN.add_argument('--gamma', type=float, default=0.99) + parser_DoubleDQN.add_argument('--exploration_min', type=float, default=1e-3) + parser_DoubleDQN.add_argument('--exploration_decay', type=float, default=0.95) return alg_parser def parse_args(): parser = argparse.ArgumentParser(add_help=False) # Add an argument - # TODO parser.add_argument('--memory', type=str, default="MemoryBuffer", help="Memory type - options: {MemoryBuffer, PER}") parser.add_argument('--image_observation', type=bool, default=False, help="Use image as the observation state from the environment") diff --git a/example/policy_example.py b/example/policy_example.py index 34a77f57..494bb2e1 100644 --- a/example/policy_example.py +++ b/example/policy_example.py @@ -82,7 +82,7 @@ def policy_based_train(env, agent, memory, record, args): if total_step_counter < max_steps_exploration: logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}") # action range the env uses [e.g. -2 , 2 for pendulum] - action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num) + action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num) # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai action = hlp.normalize(action_env, env.max_action_value, env.min_action_value) else: diff --git a/example/ppo_example.py b/example/ppo_example.py index 603a5a97..25671d0d 100644 --- a/example/ppo_example.py +++ b/example/ppo_example.py @@ -10,12 +10,13 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): - number_eval_episodes = int(args["number_eval_episodes"]) + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] + number_eval_episodes = int(args["number_eval_episodes"]) - state, _ = env.reset() + state = env.reset() for eval_episode_counter in range(number_eval_episodes): episode_timesteps = 0 @@ -27,11 +28,15 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): while not done and not truncated: episode_timesteps += 1 action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - state, reward, done, truncated, _ = env.step(action_env) + state, reward, done, truncated = env.step(action_env) episode_reward += reward + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + if done or truncated: if record is not None: record.log_eval( @@ -42,11 +47,13 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): ) # Reset environment - state, _ = env.reset() + state = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + 
record.stop_video() + def ppo_train(env, agent, record, args): start_time = time.time() @@ -55,9 +62,6 @@ def ppo_train(env, agent, record, args): max_steps_per_batch = args["max_steps_per_batch"] number_steps_per_evaluation = args["number_steps_per_evaluation"] - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] - episode_timesteps = 0 episode_num = 0 episode_reward = 0 @@ -66,16 +70,16 @@ def ppo_train(env, agent, record, args): evaluate = False - state, _ = env.reset(seed=seed) + state = env.reset() episode_start = time.time() for total_step_counter in range(int(max_steps_training)): episode_timesteps += 1 action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - next_state, reward, done, truncated, _ = env.step(action_env) + next_state, reward, done, truncated = env.step(action_env) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) state = next_state @@ -115,7 +119,7 @@ def ppo_train(env, agent, record, args): evaluate = False # Reset environment - state, _ = env.reset() + state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 diff --git a/example/value_example.py b/example/value_example.py index 9eeb7a92..ffe1ab64 100644 --- a/example/value_example.py +++ b/example/value_example.py @@ -1,18 +1,25 @@ from cares_reinforcement_learning.memory import * from cares_reinforcement_learning.util import helpers as hlp, Record +import numpy as np import time import gym import logging import random +from random import randrange + from timeit import default_timer as timer def evaluate_value_network(env, agent, args, record=None, total_steps=0): + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + number_eval_episodes = int(args["number_eval_episodes"]) - state, _ = env.reset() + state = env.reset() exploration_rate = args["exploration_min"] @@ -27,13 +34,17 @@ def evaluate_value_network(env, agent, args, record=None, total_steps=0): episode_timesteps += 1 if random.random() < exploration_rate: - action = env.action_space.sample() + action = randrange(env.action_num) else: action = agent.select_action_from_policy(state) - state, reward, done, truncated, _ = env.step(action) + state, reward, done, truncated = env.step(action) episode_reward += reward + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + if done or truncated: if record is not None: record.log_eval( @@ -44,11 +55,12 @@ def evaluate_value_network(env, agent, args, record=None, total_steps=0): ) # Reset environment - state, _ = env.reset() + state = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + record.stop_video() def value_based_train(env, agent, memory, record, args): start_time = time.time() @@ -68,7 +80,7 @@ def value_based_train(env, agent, memory, record, args): evaluate = False - state, _ = env.reset(seed=seed) + state = env.reset() exploration_rate = 1 @@ -80,11 +92,11 @@ def value_based_train(env, agent, memory, record, args): exploration_rate = max(exploration_min, exploration_rate) if random.random() < exploration_rate: - action = env.action_space.sample() + action = randrange(env.action_num) else: action = agent.select_action_from_policy(state) - next_state, reward, done, truncated, _ = env.step(action) + next_state, reward, 
done, truncated = env.step(action) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) state = next_state episode_reward += reward @@ -124,7 +136,7 @@ def value_based_train(env, agent, memory, record, args): evaluate = False # Reset environment - state, _ = env.reset() + state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 From 5ddd84ff36da6a97cf136aef69da1dd766f503f1 Mon Sep 17 00:00:00 2001 From: beardyface Date: Thu, 12 Oct 2023 10:15:04 +1300 Subject: [PATCH 8/9] added opencv to requirements --- .../util/EnvironmentFactory.py | 12 ++++-- .../util/arguement_parser.py | 2 + example/example_training_loops.py | 39 ++++++++++++------- requirements.txt | 1 + 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/cares_reinforcement_learning/util/EnvironmentFactory.py b/cares_reinforcement_learning/util/EnvironmentFactory.py index fe259661..d92a1944 100644 --- a/cares_reinforcement_learning/util/EnvironmentFactory.py +++ b/cares_reinforcement_learning/util/EnvironmentFactory.py @@ -31,8 +31,8 @@ class OpenAIGym: def __init__(self, args) -> None: logging.info(f"Training task {args['task']}") self.env = gym.make(args["task"], render_mode="rgb_array") - self.env.action_space.seed(args['seed']) - + self.set_seed(args['seed']) + @cached_property def max_action_value(self): return self.env.action_space.high[0] @@ -55,6 +55,9 @@ def action_num(self): raise ValueError(f"Unhandled action space type: {type(self.env.action_space)}") return action_num + def set_seed(self, seed): + self.env.action_space.seed(seed) + def reset(self): state, _ = self.env.reset() return state @@ -106,7 +109,7 @@ def __init__(self, args) -> None: logging.info(f"Training with Task {args['task']}") self.env = suite.load(args['domain'], args['task'], task_kwargs={'random': args['seed']}) - + @cached_property def min_action_value(self): return self.env.action_spec().minimum[0] @@ -125,6 +128,9 @@ def observation_space(self): def action_num(self): return self.env.action_spec().shape[0] + def set_seed(self, seed): + self.env = suite.load(self.env.domain, self.env.task, task_kwargs={'random': seed}) + def reset(self): time_step = self.env.reset() observation = np.hstack(list(time_step.observation.values())) # # e.g. 
position, orientation, joint_angles diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py index 65652cde..69db253a 100644 --- a/cares_reinforcement_learning/util/arguement_parser.py +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -78,6 +78,8 @@ def algorithm_args(parent_parser): def parse_args(): parser = argparse.ArgumentParser(add_help=False) # Add an argument + parser.add_argument('--number_training_iterations', type=int, default=1, help="Total amount of training iterations to complete") + parser.add_argument('--memory', type=str, default="MemoryBuffer", help="Memory type - options: {MemoryBuffer, PER}") parser.add_argument('--image_observation', type=bool, default=False, help="Use image as the observation state from the environment") diff --git a/example/example_training_loops.py b/example/example_training_loops.py index b77ad72c..9da0e569 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -20,6 +20,7 @@ import torch import random import numpy as np +from pathlib import Path def set_seed(seed): torch.manual_seed(seed) @@ -44,9 +45,6 @@ def main(): args['action_num'] = env.action_num logging.info(f"Action Num: {args['action_num']}") - logging.info(f"Seed: {args['seed']}") - set_seed(args["seed"]) - # Create the network we are using factory = NetworkFactory() logging.info(f"Algorithm: {args['algorithm']}") @@ -64,17 +62,30 @@ def main(): logging.info(f"Memory: {args['memory']}") - #create the record class - standardised results tracking - record = Record(network=agent, config={'args': args}) - # Train the policy or value based approach - if args["algorithm"] == "PPO": - ppe.ppo_train(env, agent, record, args) - elif agent.type == "policy": - pbe.policy_based_train(env, agent, memory, record, args) - elif agent.type == "value": - vbe.value_based_train(env, agent, memory, record, args) - else: - raise ValueError(f"Agent type is unkown: {agent.type}") + seed = args['seed'] + + glob_log_dir = f'{Path.home()}/cares_rl_logs/' + log_dir = f"{algoritm}-{task}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}" + + training_iterations = args['number_training_iterations'] + for training_iteration in range(0, training_iterations): + logging.info(f"Training iteration {training_iteration+1}/{training_iterations} with Seed: {seed}") + set_seed(seed) + env.set_seed(seed) + + #create the record class - standardised results tracking + record = Record(glob_log_dir=glob_log_dir, network=agent, config={'args': args}) + + # Train the policy or value based approach + if args["algorithm"] == "PPO": + ppe.ppo_train(env, agent, record, args) + elif agent.type == "policy": + pbe.policy_based_train(env, agent, memory, record, args) + elif agent.type == "value": + vbe.value_based_train(env, agent, memory, record, args) + else: + raise ValueError(f"Agent type is unkown: {agent.type}") + seed += 10 record.save() diff --git a/requirements.txt b/requirements.txt index 56d00fc1..82b27633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ typing_extensions==4.4.0 urllib3==1.26.13 PyYAML==6.0 dm_control==1.0.10 +opencv-python From 8618e9298d81ab2973857304603735b0e772bb1a Mon Sep 17 00:00:00 2001 From: beardyface Date: Thu, 12 Oct 2023 10:18:38 +1300 Subject: [PATCH 9/9] removed portions from wrong branch --- example/example_training_loops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/example/example_training_loops.py b/example/example_training_loops.py index 
9da0e569..99d1c2fe 100644
--- a/example/example_training_loops.py
+++ b/example/example_training_loops.py
@@ -64,9 +64,6 @@ def main():
 
     seed = args['seed']
 
-    glob_log_dir = f'{Path.home()}/cares_rl_logs/'
-    log_dir = f"{algoritm}-{task}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}"
-
     training_iterations = args['number_training_iterations']
     for training_iteration in range(0, training_iterations):
         logging.info(f"Training iteration {training_iteration+1}/{training_iterations} with Seed: {seed}")
@@ -74,7 +71,7 @@ def main():
         env.set_seed(seed)
 
         #create the record class - standardised results tracking
-        record = Record(glob_log_dir=glob_log_dir, network=agent, config={'args': args})
+        record = Record(network=agent, config={'args': args})
 
         # Train the policy or value based approach
         if args["algorithm"] == "PPO":
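With the duplicated log-directory handling removed, the training script after this series reduces to roughly the flow sketched below. This is a simplified outline only: the torch/numpy/random seeding, the memory buffer setup, the PPO/value branches and the final `record.save()` call are elided, and the sketch assumes the module and property names used throughout the series.

```python
# Simplified sketch of example_training_loops.py after PATCH 9 (outline only).
import torch

from cares_reinforcement_learning.util import EnvironmentFactory, NetworkFactory, Record
from cares_reinforcement_learning.util import arguement_parser as ap


def main():
    args = ap.parse_args()  # environment and algorithm chosen via sub-commands
    args["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    env = EnvironmentFactory().create_environment(args['gym_environment'], args)
    args["observation_size"] = env.observation_space  # wrapper property, not a gym Space
    args["action_num"] = env.action_num

    agent = NetworkFactory().create_network(args["algorithm"], args)

    seed = args['seed']
    for _ in range(args['number_training_iterations']):
        env.set_seed(seed)
        record = Record(network=agent, config={'args': args})
        # policy_based_train / value_based_train / ppo_train from the example modules go here
        seed += 10
```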