From e043d2fd67ac4836a26c921338039b5ab47b4e5f Mon Sep 17 00:00:00 2001 From: tonyq Date: Sun, 24 Mar 2024 16:35:22 +1300 Subject: [PATCH 01/91] First commit to re-weight --- .../algorithm/mbrl/DYNA_SAC_Reweight.py | 310 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 1 + .../memory/memory_buffer.py | 46 +++ cares_reinforcement_learning/util/__init__.py | 1 + .../util/configurations.py | 17 +- .../util/network_factory.py | 40 +++ .../util/uncertainty_estimation.py | 36 ++ 7 files changed, 450 insertions(+), 1 deletion(-) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py create mode 100644 cares_reinforcement_learning/util/uncertainty_estimation.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py new file mode 100644 index 00000000..beecae06 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py @@ -0,0 +1,310 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F +from cares_reinforcement_learning.util import sampling + +class DynaSAC_Reweight: + """ + Use the Soft Actor Critic as the Actor Critic framework. + + """ + + def __init__( + self, + actor_network, + critic_network, + world_network, + gamma, + tau, + action_num, + actor_lr, + critic_lr, + alpha_lr, + num_samples, + horizon, + device, + ): + self.type = "mbrl" + # Switches + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + # Other Variables + self.gamma = gamma + self.tau = tau + self.device = device + self.batch_size = None + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(device) + self.target_critic_net = copy.deepcopy(self.critic_net).to(device) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + + # optimizer + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + self.learn_counter = 0 + self.policy_update_freq = 1 + + @property + def _alpha(self): + """ + A variatble decide to what extend entropy shoud be valued. + """ + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy(self, state, evaluation=False, noise_scale=0): + """ + Select a action for executing. It is the only channel that an agent + will communicate the the actual environment. 
+ + """ + # note that when evaluating this algorithm we need to select mu as + # action so _, _, action = self.actor_net.sample(state_tensor) + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net.sample(state_tensor) + else: + (_, _, action) = self.actor_net.sample(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy(self, states, actions, rewards, next_states, dones, weights): + """ + Train the policy with Model-Based Value Expansion. A family of MBRL. + + """ + info = {} + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net.sample(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + q_target = q_target.detach() + assert (len(q_target.shape) == 2) and (q_target.shape[1] == 1) + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = 0.5 * ((q_values_one - q_target) * weights).pow(2).mean() + critic_loss_two = 0.5 * ((q_values_two - q_target) * weights).pow(2).mean() + # critic_loss_one = F.mse_loss(q_values_one, q_target) + # critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net.sample(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + info["q_target"] = q_target + info["q_values_one"] = q_values_one + info["q_values_two"] = q_values_two + info["q_values_min"] = torch.minimum(q_values_one, q_values_two) + info["critic_loss_total"] = critic_loss_total + info["critic_loss_one"] = critic_loss_one + info["critic_loss_two"] = critic_loss_two + info["actor_loss"] = actor_loss + return info + + def train_world_model(self, experiences): + """ + Sample the buffer again for training the world model can reach higher rewards. 
+ + :param experiences: + """ + ( + states, + actions, + rewards, + next_states, + _, + next_actions, + next_rewards, + ) = experiences + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + next_rewards = ( + torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) + ) + next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert len(rewards.shape) == 2 and rewards.shape[1] == 1 + assert len(next_rewards.shape) == 2 and next_rewards.shape[1] == 1 + assert len(next_states.shape) >= 2 + + # # Step 1 train the world model. + self.world_model.train_world( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + next_actions=next_actions, + next_rewards=next_rewards, + + ) + + def train_policy(self, experiences): + """ + Interface to training loop. + + """ + self.learn_counter += 1 + ( + states, + actions, + rewards, + next_states, + dones, + ) = experiences + self.batch_size = len(states) + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert len(rewards.shape) == 2 and rewards.shape[1] == 1 + assert len(next_states.shape) >= 2 + full_weights = torch.ones(rewards.shape) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1) + pred_uncerts.append(uncert) + + pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! 
+ pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def set_statistics(self, stats): + """ + Set and update the statatistics (means and stds) for MBRL to normalize the states. + + """ + self.world_model.set_statistics(stats) + + def save_models(self, filename, filepath="models"): + """ + Save the intrim actor critics. + """ + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath, filename): + """ + Load trained networks + """ + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 2ceefac4..6d6fa39b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1 +1,2 @@ from .DYNA_SAC import DynaSAC +from .DYNA_SAC_Reweight import DynaSAC_Reweight diff --git a/cares_reinforcement_learning/memory/memory_buffer.py b/cares_reinforcement_learning/memory/memory_buffer.py index 59ded1f0..c0b8f6e8 100644 --- a/cares_reinforcement_learning/memory/memory_buffer.py +++ b/cares_reinforcement_learning/memory/memory_buffer.py @@ -84,3 +84,49 @@ def get_statistics(self): "delta_std": delta_std, } return statistics + + def sample_consecutive(self, batch_size): + """ + For training MBRL to predict rewards. The right next transition is + sampled as well. WHEN THE BUFFER IS NOT SHUFFLED. + It is not overriding the original sample() due to sample() can be used + for normal training, and the current sample_consecutive() is slower. + (State, action, reward, next_state, next_action, next_reard) + """ + max_length = len(self.buffer) - 1 + candi_indices = list(range(max_length)) + batch_size = min(batch_size, max_length) + # A list of candidate indices includes all indices. + sampled_indices = [] # randomly sampled indices that is okay. + # In this way, the sampling time depends on the batch size rather than buffer size. + first_sample = True # Not check duplicate for first time sample. + while True: + # Sample size based on how many still needed. + idxs = random.sample(candi_indices, batch_size - len(sampled_indices)) + for i in idxs: + # Check if it is already sampled. + already_sampled = False + # Only check if it is not first time in the while loop. + if not first_sample: + # compare with each item in the sampled. 
+ for j in sampled_indices: + if j == i: + already_sampled = True + if (self.buffer[i][4] is False) and (not already_sampled): + sampled_indices.append(i) + if len(sampled_indices) == batch_size: + break + first_sample = False + if len(sampled_indices) == batch_size: + break + # Form the sampled data batch + experience_batch = [ + self.buffer[i] + + ( + self.buffer[i + 1][1], + self.buffer[i + 1][2], + ) + for i in sampled_indices + ] + transposed_batch = zip(*experience_batch) + return transposed_batch diff --git a/cares_reinforcement_learning/util/__init__.py b/cares_reinforcement_learning/util/__init__.py index 188a2b99..46191902 100644 --- a/cares_reinforcement_learning/util/__init__.py +++ b/cares_reinforcement_learning/util/__init__.py @@ -3,3 +3,4 @@ from cares_reinforcement_learning.util.memory_factory import MemoryFactory from cares_reinforcement_learning.util.rl_parser import RLParser from cares_reinforcement_learning.util.helpers import * +from cares_reinforcement_learning.util.uncertainty_estimation import * \ No newline at end of file diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index cff7edd4..b9ee5ae2 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -22,7 +22,7 @@ class TrainingConfig(SubscriptableClass): # for general agent training. G: Optional[int] = 1 # for training the world model in MBRL. - G_model: Optional[int] = 1 + G_model: Optional[float] = 1 buffer_size: Optional[int] = 1000000 batch_size: Optional[int] = 256 @@ -130,6 +130,21 @@ class DYNAConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 +class MBRL_DYNA_ReweightConfig(AlgorithmConfig): + algorithm: str = Field("MBRL_DYNA_Reweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + memory: Optional[str] = "MemoryBuffer" + horizon: Optional[int] = 3 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + class NaSATD3Config(AlgorithmConfig): algorithm: str = Field("NaSATD3", Literal=True) # actor_lr: Optional[float] = 1e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 067ee86f..f7769bb9 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -118,6 +118,46 @@ def create_MBRL_DYNA(observation_size, action_num, config: DYNAConfig): return agent +def create_MBRL_DYNA_Reweight(observation_size, action_num, config: DYNAConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Reweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_Reweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + ) + return agent + + def create_SAC(observation_size, action_num, config: AlgorithmConfig): """ Create an SAC agent. diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py new file mode 100644 index 00000000..bcb1cd19 --- /dev/null +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -0,0 +1,36 @@ +import torch +import torch.nn.functional as F + + +def sampling(pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + # 5 models, each sampled 10 times = 50, + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [10]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [10]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [10]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [10]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [10]) + samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) + # Samples = [5 * 10, 10 predictions, 11 state dims] + # print(samples.shape) + stds = torch.var(samples, dim=0) + # print(stds.shape) + # [10 predictions, 11 state dims] + total_stds = torch.mean(stds, dim=1) + total_stds = F.sigmoid(total_stds) + # total_stds = 1 / total_stds + # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, + # high std, encouraged. 
+ # total_stds = total_stds - torch.min(total_stds) + return total_stds.detach() \ No newline at end of file From 0beef638622ea25dc057b828be2931f50bfa2d2f Mon Sep 17 00:00:00 2001 From: tonyq Date: Sun, 24 Mar 2024 22:47:44 +1300 Subject: [PATCH 02/91] Bring weights to device --- .../algorithm/mbrl/DYNA_SAC_Reweight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py index beecae06..d4cf454a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py @@ -227,7 +227,7 @@ def train_policy(self, experiences): assert len(actions.shape) == 2 assert len(rewards.shape) == 2 and rewards.shape[1] == 1 assert len(next_states.shape) >= 2 - full_weights = torch.ones(rewards.shape) + full_weights = torch.ones(rewards.shape).to(self.device) # Step 2 train as usual self._train_policy( states=states, @@ -260,7 +260,7 @@ def _dyna_generate_and_train(self, next_states): pred_state, pred_acts ) uncert = sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1) + uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) From 58e36f3cfc55cba103409a86cb44951f6fd02868 Mon Sep 17 00:00:00 2001 From: tonyq Date: Sun, 24 Mar 2024 23:21:54 +1300 Subject: [PATCH 03/91] Dyna Name error --- cares_reinforcement_learning/util/configurations.py | 2 +- cares_reinforcement_learning/util/network_factory.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index b9ee5ae2..7e226648 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -116,7 +116,7 @@ class SACConfig(AlgorithmConfig): class DYNAConfig(AlgorithmConfig): - algorithm: str = Field("MBRL_DYNA", Literal=True) + algorithm: str = Field("DYNA", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 alpha_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index f7769bb9..7dc361d7 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -78,7 +78,7 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent -def create_MBRL_DYNA(observation_size, action_num, config: DYNAConfig): +def create_DYNA(observation_size, action_num, config: DYNAConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
From 60279b0005fcf043a48dbbeb4c82d0d61d92429b Mon Sep 17 00:00:00 2001 From: tonyq Date: Thu, 28 Mar 2024 15:03:09 +1300 Subject: [PATCH 04/91] a line of comment --- cares_reinforcement_learning/util/uncertainty_estimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py index bcb1cd19..86978075 100644 --- a/cares_reinforcement_learning/util/uncertainty_estimation.py +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -28,7 +28,7 @@ def sampling(pred_means, pred_vars): # print(stds.shape) # [10 predictions, 11 state dims] total_stds = torch.mean(stds, dim=1) - total_stds = F.sigmoid(total_stds) + total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 # total_stds = 1 / total_stds # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, # high std, encouraged. From 26a4e2e89464c3dc0510b9fd9e61ba4ceb988e79 Mon Sep 17 00:00:00 2001 From: "QIAO, Ting" Date: Wed, 3 Apr 2024 16:57:13 +1300 Subject: [PATCH 05/91] Update DYNA_SAC_Reweight.py --- .../algorithm/mbrl/DYNA_SAC_Reweight.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py index d4cf454a..26558db2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py @@ -260,7 +260,9 @@ def _dyna_generate_and_train(self, next_states): pred_state, pred_acts ) uncert = sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = 1.5 - uncert uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) From 58f9b2c13394d930d2c1aed5aeed80ebf12c9f78 Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 14:51:33 +1200 Subject: [PATCH 06/91] nothing --- .../mbrl/{DYNA_SAC_Reweight.py => DynaSAC_Reweight.py} | 0 cares_reinforcement_learning/algorithm/mbrl/__init__.py | 2 +- cares_reinforcement_learning/util/uncertainty_estimation.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) rename cares_reinforcement_learning/algorithm/mbrl/{DYNA_SAC_Reweight.py => DynaSAC_Reweight.py} (100%) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py similarity index 100% rename from cares_reinforcement_learning/algorithm/mbrl/DYNA_SAC_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 56003866..87c0c7f8 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,2 +1,2 @@ from .DynaSAC import DynaSAC -from .DYNA_SAC_Reweight import DynaSAC_Reweight +from .DynaSAC_Reweight import DynaSAC_Reweight diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py index 86978075..e897f908 100644 --- a/cares_reinforcement_learning/util/uncertainty_estimation.py +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -21,6 +21,7 @@ def sampling(pred_means, pred_vars): [10]) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( [10]) + samples = torch.cat((sample1, 
sample2, sample3, sample4, sample5)) # Samples = [5 * 10, 10 predictions, 11 state dims] # print(samples.shape) From 1ab27938c48e9c82261242b1de9c0f7a1ecdfd81 Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 15:16:09 +1200 Subject: [PATCH 07/91] Change the SAC configurations and alpha learning rate. --- cares_reinforcement_learning/algorithm/policy/SAC.py | 3 ++- cares_reinforcement_learning/util/configurations.py | 2 +- cares_reinforcement_learning/util/network_factory.py | 11 +++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/policy/SAC.py b/cares_reinforcement_learning/algorithm/policy/SAC.py index a1cea5dc..6794c1da 100644 --- a/cares_reinforcement_learning/algorithm/policy/SAC.py +++ b/cares_reinforcement_learning/algorithm/policy/SAC.py @@ -27,6 +27,7 @@ def __init__( action_num: int, actor_lr: float, critic_lr: float, + alpha_lr: float, device: torch.device, ): self.type = "policy" @@ -59,7 +60,7 @@ def __init__( init_temperature = 1.0 self.log_alpha = torch.tensor(np.log(init_temperature)).to(device) self.log_alpha.requires_grad = True - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=1e-3) + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) # pylint: disable-next=unused-argument def select_action_from_policy( diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index b002de84..7e2bedb3 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -129,7 +129,7 @@ class SACConfig(AlgorithmConfig): algorithm: str = Field("SAC", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 - + alpha_lr: Optional[float] = 3e-4 gamma: Optional[float] = 0.99 tau: Optional[float] = 0.005 reward_scale: Optional[float] = 1.0 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 50567226..acf672c8 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -4,7 +4,10 @@ import torch -from cares_reinforcement_learning.util.configurations import AlgorithmConfig +from cares_reinforcement_learning.util.configurations import ( + AlgorithmConfig, + SACConfig, +) # Disable these as this is a deliberate use of dynamic imports @@ -117,7 +120,10 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): return agent -def create_SAC(observation_size, action_num, config: AlgorithmConfig): +def create_SAC(observation_size, action_num, config: SACConfig): + """ + Create an SAC agent. 
+ """ from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic @@ -130,6 +136,7 @@ def create_SAC(observation_size, action_num, config: AlgorithmConfig): critic_network=critic, actor_lr=config.actor_lr, critic_lr=config.critic_lr, + alpha_lr=config.alpha_lr, gamma=config.gamma, tau=config.tau, reward_scale=config.reward_scale, From 65901f46b7bf634bb7355bdd4a2f3faa02686e4e Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 15:32:26 +1200 Subject: [PATCH 08/91] Correct refer to the parameters for SAC and DynaSAC --- cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py | 2 +- cares_reinforcement_learning/util/network_factory.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index d7aa1c12..034e80ed 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -22,7 +22,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: torch.nn.Module, + world_network: object, gamma: float, tau: float, action_num: int, diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index acf672c8..0f6c4610 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -6,6 +6,7 @@ from cares_reinforcement_learning.util.configurations import ( AlgorithmConfig, + DynaSACConfig, SACConfig, ) @@ -80,7 +81,7 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent -def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC(observation_size, action_num, config: DynaSACConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From ca21ca366a6589ae62eea578f2927ae269539e51 Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 15:52:27 +1200 Subject: [PATCH 09/91] SAC sample to forward. --- cares_reinforcement_learning/algorithm/policy/SAC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/policy/SAC.py b/cares_reinforcement_learning/algorithm/policy/SAC.py index 6794c1da..26291673 100644 --- a/cares_reinforcement_learning/algorithm/policy/SAC.py +++ b/cares_reinforcement_learning/algorithm/policy/SAC.py @@ -72,7 +72,7 @@ def select_action_from_policy( state_tensor = torch.FloatTensor(state) state_tensor = state_tensor.unsqueeze(0).to(self.device) if evaluation is False: - (action, _, _) = self.actor_net.sample(state_tensor) + (action, _, _) = self.actor_net(state_tensor) else: (_, _, action) = self.actor_net(state_tensor) action = action.cpu().data.numpy().flatten() From b1abac378357bf57fbf01b636185ebc95b67e004 Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 15:58:06 +1200 Subject: [PATCH 10/91] set G_model to float for now. 
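Making G_model a float leaves room for a fractional ratio of world-model
updates to policy updates instead of a whole number only. How a fractional
value is consumed is not defined in this patch; a minimal hypothetical
sketch, assuming the training loop accrues the fractional remainder (the
print calls stand in for the real update hooks, which are not part of this
repository):

    # Hypothetical use of a fractional G_model: 0.5 -> one world-model update
    # every second policy update; 2.5 -> five updates every two policy updates.
    G_model = 0.5        # placeholder value read from TrainingConfig
    total_steps = 10
    accrued = 0.0
    for step in range(total_steps):
        print("policy update", step)        # placeholder policy-update hook
        accrued += G_model
        while accrued >= 1.0:               # run the whole updates accrued so far
            print("world-model update")     # placeholder world-model-update hook
            accrued -= 1.0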
--- cares_reinforcement_learning/util/configurations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 7e2bedb3..70f095b3 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -55,7 +55,7 @@ class AlgorithmConfig(SubscriptableClass): algorithm: str = Field(description="Name of the algorithm to be used") G: Optional[int] = 1 - G_model: Optional[int] = 1 + G_model: Optional[float] = 1 buffer_size: Optional[int] = 1000000 batch_size: Optional[int] = 256 max_steps_exploration: Optional[int] = 1000 From 24d0f296f1071d749c11d0da9a1ccb383176f8fc Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 16:10:12 +1200 Subject: [PATCH 11/91] set G_model to float for now. --- cares_reinforcement_learning/util/network_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 64131840..e41a7707 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -121,7 +121,7 @@ def create_DynaSAC(observation_size, action_num, config: DynaSACConfig): return agent -def create_MBRL_DYNA_Reweight(observation_size, action_num, config: DYNAConfig): +def create_DynaSAC_Reweight(observation_size, action_num, config: DynaSACConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From e42994b60b64a183c8c81afdc442427feb808865 Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 16:49:01 +1200 Subject: [PATCH 12/91] Changed sum tree position --- cares_reinforcement_learning/util/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/util/__init__.py b/cares_reinforcement_learning/util/__init__.py index cbac858e..8dbd1fcd 100644 --- a/cares_reinforcement_learning/util/__init__.py +++ b/cares_reinforcement_learning/util/__init__.py @@ -2,5 +2,4 @@ from cares_reinforcement_learning.util.record import Record from cares_reinforcement_learning.util.rl_parser import RLParser from cares_reinforcement_learning.util.helpers import * -from cares_reinforcement_learning.util.uncertainty_estimation import * -from cares_reinforcement_learning.util.sum_tree import SumTree +from cares_reinforcement_learning.util.uncertainty_estimation import * \ No newline at end of file From a01b112ae503ca4b0fa33f12b2284c2f1c6d3aac Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 17:31:10 +1200 Subject: [PATCH 13/91] Change different way of training and using reward networks. 
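The reward model now predicts a mean and a variance instead of a point
estimate, the joint loss trains it with a Gaussian negative log-likelihood
next to the dynamics NLL, and ensemble reward aggregation switches from the
pessimistic minimum to a randomly chosen member. A minimal self-contained
sketch of the combined objective; the random tensors are stand-ins for the
real network outputs and replay batch, not repository code:

    import torch
    import torch.nn.functional as F

    batch, state_dim = 4, 11                       # placeholder sizes
    normalized_mean = torch.randn(batch, state_dim)
    normalized_var = torch.rand(batch, state_dim) + 1e-3
    delta_target = torch.randn(batch, state_dim)
    rwd_mean = torch.randn(batch, 1)
    rwd_var = torch.rand(batch, 1) + 1e-3
    next_rewards = torch.randn(batch, 1)

    # Dynamics head: Gaussian NLL on the normalized next-state delta.
    model_loss = F.gaussian_nll_loss(
        input=normalized_mean, target=delta_target, var=normalized_var
    )

    # Reward head: mean + variance trained the same way, replacing plain MSE.
    rwd_loss = F.gaussian_nll_loss(input=rwd_mean, target=next_rewards, var=rwd_var)

    # One joint objective, one backward pass through both heads.
    all_loss = rwd_loss + model_loss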
--- .../networks/world_models/ensemble_integrated.py | 16 ++++++++++++---- .../networks/world_models/simple_rewards.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py index abd48a98..0962ef99 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py @@ -104,8 +104,11 @@ def train_overall( model_loss = F.gaussian_nll_loss( input=normalized_mean, target=delta_targets_normalized, var=normalized_var ).mean() - pred_rewards = self.reward_network.forward(pred_next_state, next_actions) - all_loss = F.mse_loss(pred_rewards, next_rewards) + model_loss.mean() + + rwd_mean, rwd_var = self.reward_network.forward(pred_next_state, next_actions) + rwd_loss = F.gaussian_nll_loss(input=rwd_mean, target=next_rewards, var=rwd_var) + all_loss = rwd_loss + model_loss.mean() + # Update self.all_optimizer.zero_grad() all_loss.backward() @@ -183,11 +186,15 @@ def pred_rewards( """ rewards = [] for model in self.models: - pred_rewards = model.reward_network.forward(observation, actions) + pred_rewards, _ = model.reward_network.forward(observation, actions) rewards.append(pred_rewards) # Use average rewards = torch.stack(rewards) - reward = torch.min(rewards, dim=0).values # Pessimetic + + rand_ind = random.randint(0, rewards.shape[0]) - 1 + reward = rewards[rand_ind] + + # reward = torch.min(rewards, dim=0).values # Pessimetic return reward, rewards def pred_next_states( @@ -232,6 +239,7 @@ def pred_next_states( if len(not_nans) == 0: logging.info("Predicting all Nans") sys.exit() + # Random Take next state. 
rand_ind = random.randint(0, len(not_nans) - 1) prediction = predictions_means[not_nans[rand_ind]] # next = current + delta diff --git a/cares_reinforcement_learning/networks/world_models/simple_rewards.py b/cares_reinforcement_learning/networks/world_models/simple_rewards.py index 4c7574bd..ae9adcfc 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_rewards.py +++ b/cares_reinforcement_learning/networks/world_models/simple_rewards.py @@ -20,7 +20,8 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): self.num_actions = num_actions self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) + self.mean = nn.Linear(hidden_size, 1) + self.var = nn.Linear(hidden_size, 1) self.apply(weight_init) def forward( @@ -45,7 +46,10 @@ def forward( x = F.relu(x) x = self.linear2(x) x = F.relu(x) - x = self.linear3(x) + rwd_mean = self.mean(x) + rwd_var = self.var(x) + logvar = torch.tanh(rwd_var) + rwd_var = torch.exp(logvar) if normalized: - x = F.sigmoid(x) - return x + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean, rwd_var From 9badc15de9d08e5899dc85ddabfc756198d13be3 Mon Sep 17 00:00:00 2001 From: tonyq Date: Mon, 22 Apr 2024 22:17:40 +1200 Subject: [PATCH 14/91] Reweight --- .../algorithm/mbrl/DynaSAC_Reweight.py | 193 +++++++----------- .../util/configurations.py | 17 ++ .../util/uncertainty_estimation.py | 4 + 3 files changed, 99 insertions(+), 115 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index 26558db2..c2e2e8eb 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -1,6 +1,8 @@ """ Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + This code runs automatic entropy tuning """ @@ -11,100 +13,94 @@ import numpy as np import torch import torch.nn.functional as F -from cares_reinforcement_learning.util import sampling -class DynaSAC_Reweight: - """ - Use the Soft Actor Critic as the Actor Critic framework. 
+from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +from cares_reinforcement_learning.util import sampling - """ +class DynaSAC_Reweight: def __init__( self, - actor_network, - critic_network, - world_network, - gamma, - tau, - action_num, - actor_lr, - critic_lr, - alpha_lr, - num_samples, - horizon, - device, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: object, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, ): self.type = "mbrl" - # Switches - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - # Other Variables - self.gamma = gamma - self.tau = tau self.device = device - self.batch_size = None # this may be called policy_net in other implementations - self.actor_net = actor_network.to(device) + self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(device) - self.target_critic_net = copy.deepcopy(self.critic_net).to(device) + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) - # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 - # optimizer self.actor_net_optimiser = torch.optim.Adam( self.actor_net.parameters(), lr=actor_lr ) self.critic_net_optimiser = torch.optim.Adam( self.critic_net.parameters(), lr=critic_lr ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) # World model self.world_model = world_network - self.learn_counter = 0 - self.policy_update_freq = 1 @property - def _alpha(self): - """ - A variatble decide to what extend entropy shoud be valued. - """ + def _alpha(self) -> float: return self.log_alpha.exp() # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy(self, state, evaluation=False, noise_scale=0): - """ - Select a action for executing. It is the only channel that an agent - will communicate the the actual environment. - - """ + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as - # action so _, _, action = self.actor_net.sample(state_tensor) self.actor_net.eval() with torch.no_grad(): state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) if evaluation is False: - (action, _, _) = self.actor_net.sample(state_tensor) + (action, _, _) = self.actor_net(state_tensor) else: - (_, _, action) = self.actor_net.sample(state_tensor) + (_, _, action) = self.actor_net(state_tensor) action = action.cpu().data.numpy().flatten() self.actor_net.train() return action - def _train_policy(self, states, actions, rewards, next_states, dones, weights): - """ - Train the policy with Model-Based Value Expansion. A family of MBRL. 
- - """ - info = {} + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net.sample(next_states) + next_actions, next_log_pi, _ = self.actor_net(next_states) target_q_one, target_q_two = self.target_critic_net( next_states, next_actions ) @@ -112,22 +108,22 @@ def _train_policy(self, states, actions, rewards, next_states, dones, weights): torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values - q_target = q_target.detach() - assert (len(q_target.shape) == 2) and (q_target.shape[1] == 1) q_values_one, q_values_two = self.critic_net(states, actions) - critic_loss_one = 0.5 * ((q_values_one - q_target) * weights).pow(2).mean() - critic_loss_two = 0.5 * ((q_values_two - q_target) * weights).pow(2).mean() + critic_loss_one = 0.5 * (weights * (q_values_one - q_target).pow(2)).mean() + critic_loss_two = 0.5 * (weights * (q_values_two - q_target).pow(2)).mean() + # critic_loss_one = F.mse_loss(q_values_one, q_target) # critic_loss_two = F.mse_loss(q_values_two, q_target) critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic self.critic_net_optimiser.zero_grad() critic_loss_total.backward() self.critic_net_optimiser.step() ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net.sample(states) + pi, first_log_p, _ = self.actor_net(states) qf1_pi, qf2_pi = self.critic_net(states, pi) min_qf_pi = torch.minimum(qf1_pi, qf2_pi) actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() @@ -137,10 +133,11 @@ def _train_policy(self, states, actions, rewards, next_states, dones, weights): actor_loss.backward() self.actor_net_optimiser.step() - # update the temperature + # Update the temperature alpha_loss = -( self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() + self.log_alpha_optimizer.zero_grad() alpha_loss.backward() self.log_alpha_optimizer.step() @@ -153,31 +150,25 @@ def _train_policy(self, states, actions, rewards, next_states, dones, weights): param.data * self.tau + target_param.data * (1.0 - self.tau) ) - info["q_target"] = q_target - info["q_values_one"] = q_values_one - info["q_values_two"] = q_values_two - info["q_values_min"] = torch.minimum(q_values_one, q_values_two) - info["critic_loss_total"] = critic_loss_total - info["critic_loss_one"] = critic_loss_one - info["critic_loss_two"] = critic_loss_two - info["actor_loss"] = actor_loss - return info - - def train_world_model(self, experiences): - """ - Sample the buffer again for training the world model can reach higher rewards. 
+ def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_consecutive(batch_size) - :param experiences: - """ ( states, actions, rewards, next_states, _, + _, next_actions, next_rewards, + _, + _, + _, ) = experiences + states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) @@ -186,13 +177,8 @@ def train_world_model(self, experiences): torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) ) next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) - assert len(states.shape) >= 2 - assert len(actions.shape) == 2 - assert len(rewards.shape) == 2 and rewards.shape[1] == 1 - assert len(next_rewards.shape) == 2 and next_rewards.shape[1] == 1 - assert len(next_states.shape) >= 2 - # # Step 1 train the world model. + # Step 1 train the world model. self.world_model.train_world( states=states, actions=actions, @@ -200,33 +186,20 @@ def train_world_model(self, experiences): next_states=next_states, next_actions=next_actions, next_rewards=next_rewards, - ) - def train_policy(self, experiences): - """ - Interface to training loop. - - """ + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: self.learn_counter += 1 - ( - states, - actions, - rewards, - next_states, - dones, - ) = experiences - self.batch_size = len(states) + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + # Convert into tensor states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - assert len(states.shape) >= 2 - assert len(actions.shape) == 2 - assert len(rewards.shape) == 2 and rewards.shape[1] == 1 - assert len(next_states.shape) >= 2 full_weights = torch.ones(rewards.shape).to(self.device) # Step 2 train as usual self._train_policy( @@ -259,12 +232,12 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) + uncert = sampling(pred_means=pred_mean, pred_vars=pred_var) uncert = 1.5 - uncert uncert = uncert.unsqueeze(dim=1).to(self.device) - - pred_uncerts.append(uncert) + pred_uncerts.append(uncert) pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) @@ -283,17 +256,10 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def set_statistics(self, stats): - """ - Set and update the statatistics (means and stds) for MBRL to normalize the states. - - """ + def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) - def save_models(self, filename, filepath="models"): - """ - Save the intrim actor critics. 
- """ + def save_models(self, filename: str, filepath: str = "models") -> None: path = f"{filepath}/models" if filepath != "models" else filepath dir_exists = os.path.exists(path) if not dir_exists: @@ -302,10 +268,7 @@ def save_models(self, filename, filepath="models"): torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") logging.info("models has been saved...") - def load_models(self, filepath, filename): - """ - Load trained networks - """ + def load_models(self, filepath: str, filename: str) -> None: path = f"{filepath}/models" if filepath != "models" else filepath self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 70f095b3..8a51b20d 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -135,6 +135,23 @@ class SACConfig(AlgorithmConfig): reward_scale: Optional[float] = 1.0 +class DynaSAC_ReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Reweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + + horizon: Optional[int] = 3 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + class DynaSACConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC", Literal=True) actor_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py index e897f908..1992373d 100644 --- a/cares_reinforcement_learning/util/uncertainty_estimation.py +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -29,6 +29,10 @@ def sampling(pred_means, pred_vars): # print(stds.shape) # [10 predictions, 11 state dims] total_stds = torch.mean(stds, dim=1) + # Clip for sigmoid + total_stds[total_stds < 0.2] = 0.0 + total_stds[total_stds > 4.0] = 4.0 + total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 # total_stds = 1 / total_stds # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, From fcce509e305d5ffe22e9b5cf40d7ab63ee226ef4 Mon Sep 17 00:00:00 2001 From: tonyq Date: Tue, 23 Apr 2024 18:52:08 +1200 Subject: [PATCH 15/91] network_factory.py add alpha_lr parameter for SAC. 
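For reference, the part of SAC that the alpha_lr setting controls is the
entropy-temperature update, which previously used a hard-coded learning rate
of 1e-3. A condensed, self-contained sketch of that update; the batch of
log-probabilities is a random stand-in for the actor's output:

    import numpy as np
    import torch

    action_num = 6                      # placeholder action dimension
    alpha_lr = 3e-4                     # now taken from the algorithm config
    target_entropy = -action_num

    log_alpha = torch.tensor(np.log(1.0), requires_grad=True)
    log_alpha_optimizer = torch.optim.Adam([log_alpha], lr=alpha_lr)

    log_pi = torch.randn(256, 1)        # stand-in for actor log-probabilities
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()

    log_alpha_optimizer.zero_grad()
    alpha_loss.backward()
    log_alpha_optimizer.step()

    alpha = log_alpha.exp()             # temperature used in the SAC losses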
--- .../util/network_factory.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index e41a7707..3b936fd1 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -4,11 +4,7 @@ import torch -from cares_reinforcement_learning.util.configurations import ( - AlgorithmConfig, - DynaSACConfig, - SACConfig, -) +from cares_reinforcement_learning.util.configurations import AlgorithmConfig # Disable these as this is a deliberate use of dynamic imports @@ -121,7 +117,7 @@ def create_DynaSAC(observation_size, action_num, config: DynaSACConfig): return agent -def create_DynaSAC_Reweight(observation_size, action_num, config: DynaSACConfig): +def create_DynaSAC_Reweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. @@ -161,10 +157,7 @@ def create_DynaSAC_Reweight(observation_size, action_num, config: DynaSACConfig) return agent -def create_SAC(observation_size, action_num, config: SACConfig): - """ - Create an SAC agent. - """ +def create_SAC(observation_size, action_num, config: AlgorithmConfig): from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic From 157a9b7b2fd3d7e14a64fc1b8c3f98bea9c3ce4a Mon Sep 17 00:00:00 2001 From: "QIAO, Ting" Date: Wed, 24 Apr 2024 13:34:45 +1200 Subject: [PATCH 16/91] Update network_factory.py --- cares_reinforcement_learning/util/network_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 3b936fd1..8f33d615 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -77,7 +77,7 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent -def create_DynaSAC(observation_size, action_num, config: DynaSACConfig): +def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From 32bcb07f19c00c6ceb9f11f1dbac510a2c61de46 Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 24 Apr 2024 18:05:53 +1200 Subject: [PATCH 17/91] Use Ensmeble of network and simple reward approximator. 
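The world model is reorganised into an ensemble of dynamics networks paired
with a single reward network (EnsembleWorldAndOneReward), both trained from
uniformly sampled batches rather than consecutive transitions. A condensed
sketch of the call pattern the agents use; the random batch is a placeholder,
and a real run also calls world_model.set_statistics(...) with the replay
buffer's normalisation statistics before training:

    import torch

    from cares_reinforcement_learning.networks.world_models import (
        EnsembleWorldAndOneReward,
    )

    world_model = EnsembleWorldAndOneReward(
        observation_size=11, num_actions=3, num_models=5, lr=1e-3, device="cpu"
    )

    # Placeholder batch standing in for a uniform replay-buffer sample.
    states = torch.randn(256, 11)
    actions = torch.rand(256, 3) * 2 - 1
    rewards = torch.randn(256, 1)
    next_states = torch.randn(256, 11)

    # The dynamics ensemble and the single reward head are trained separately.
    world_model.train_world(states=states, actions=actions, next_states=next_states)
    world_model.train_reward(states=states, actions=actions, rewards=rewards)

    # Rollouts query a predicted next state (plus per-member means/variances
    # used for the uncertainty weight) and a reward from the single reward head.
    pred_next, _, pred_means, pred_vars = world_model.pred_next_states(states, actions)
    pred_reward = world_model.pred_rewards(states, actions)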
--- .../algorithm/mbrl/DynaSAC.py | 65 ++++--- .../algorithm/mbrl/DynaSAC_Reweight.py | 73 +++++--- .../networks/world_models/__init__.py | 3 + .../world_models/ensemble_integrated.py | 11 +- .../networks/world_models/ensemble_world.py | 166 ++++++++++++++++++ .../world_models/probability_rewards.py | 55 ++++++ .../networks/world_models/simple_rewards.py | 10 +- .../util/network_factory.py | 6 +- 8 files changed, 328 insertions(+), 61 deletions(-) create mode 100644 cares_reinforcement_learning/networks/world_models/ensemble_world.py create mode 100644 cares_reinforcement_learning/networks/world_models/probability_rewards.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index 5ca99eda..86dd20f7 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -18,6 +18,9 @@ from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( EnsembleWorldReward, ) +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) class DynaSAC: @@ -25,7 +28,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldReward, + world_network: EnsembleWorldAndOneReward, gamma: float, tau: float, action_num: int, @@ -151,39 +154,53 @@ def _train_policy( def train_world_model( self, memory: PrioritizedReplayBuffer, batch_size: int ) -> None: - experiences = memory.sample_consecutive(batch_size) - - ( - states, - actions, - rewards, - next_states, - _, - _, - next_actions, - next_rewards, - _, - _, - _, - ) = experiences + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + # experiences = memory.sample_consecutive(batch_size) + # ( + # states, + # actions, + # rewards, + # next_states, + # _, + # _, + # next_actions, + # next_rewards, + # _, + # _, + # _, + # ) = experiences states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - next_rewards = ( - torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) - ) - next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) + # next_rewards = ( + # torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) + # ) + # next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) # Step 1 train the world model. 
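+        # The dynamics ensemble and the single reward head are now trained in
+        # two separate steps: train_world() fits next-state prediction on
+        # (state, action, next_state) and train_reward() fits the reward
+        # network on (state, action, reward) from the same uniform batch.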
+ # self.world_model.train_world( + # states=states, + # actions=actions, + # rewards=rewards, + # next_states=next_states, + # next_actions=next_actions, + # next_rewards=next_rewards, + # ) + self.world_model.train_world( states=states, actions=actions, - rewards=rewards, next_states=next_states, - next_actions=next_actions, - next_rewards=next_rewards, + ) + self.world_model.train_reward( + states=states, + actions=actions, + rewards=rewards, ) def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: @@ -226,7 +243,7 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_next_state, _, _, _ = self.world_model.pred_next_states( pred_state, pred_acts ) - pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) + pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index c2e2e8eb..28ef992a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -17,13 +17,20 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer from cares_reinforcement_learning.util import sampling +from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( + EnsembleWorldReward, +) +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + class DynaSAC_Reweight: def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: object, + world_network: EnsembleWorldAndOneReward, gamma: float, tau: float, action_num: int, @@ -153,39 +160,53 @@ def _train_policy( def train_world_model( self, memory: PrioritizedReplayBuffer, batch_size: int ) -> None: - experiences = memory.sample_consecutive(batch_size) - - ( - states, - actions, - rewards, - next_states, - _, - _, - next_actions, - next_rewards, - _, - _, - _, - ) = experiences + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + # experiences = memory.sample_consecutive(batch_size) + # + # ( + # states, + # actions, + # rewards, + # next_states, + # _, + # _, + # next_actions, + # next_rewards, + # _, + # _, + # _, + # ) = experiences states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - next_rewards = ( - torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) - ) - next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) + # next_rewards = ( + # torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) + # ) + # next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) + + # # Step 1 train the world model. + # self.world_model.train_world( + # states=states, + # actions=actions, + # rewards=rewards, + # next_states=next_states, + # next_actions=next_actions, + # next_rewards=next_rewards, + # ) - # Step 1 train the world model. 
self.world_model.train_world( states=states, actions=actions, - rewards=rewards, next_states=next_states, - next_actions=next_actions, - next_rewards=next_rewards, + ) + self.world_model.train_reward( + states=states, + actions=actions, + rewards=rewards, ) def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: @@ -229,6 +250,7 @@ def _dyna_generate_and_train(self, next_states): # This part is controversial. But random actions is empirically better. rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) @@ -238,7 +260,8 @@ def _dyna_generate_and_train(self, next_states): uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) - pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) + pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) + pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) diff --git a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 7a4356a7..0fede213 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,3 +1,6 @@ from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( EnsembleWorldReward, ) +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) \ No newline at end of file diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py index 0962ef99..d7b10f87 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py @@ -15,6 +15,10 @@ from cares_reinforcement_learning.networks.world_models.simple_rewards import ( SimpleReward, ) +# from cares_reinforcement_learning.networks.world_models.probability_rewards import ( +# ProbabilityReward, +# ) + from cares_reinforcement_learning.util.helpers import normalize_observation_delta @@ -105,8 +109,9 @@ def train_overall( input=normalized_mean, target=delta_targets_normalized, var=normalized_var ).mean() - rwd_mean, rwd_var = self.reward_network.forward(pred_next_state, next_actions) - rwd_loss = F.gaussian_nll_loss(input=rwd_mean, target=next_rewards, var=rwd_var) + rwd_mean = self.reward_network.forward(pred_next_state, next_actions) + # rwd_loss = F.gaussian_nll_loss(input=rwd_mean, target=next_rewards, var=rwd_var) + rwd_loss = F.mse_loss(rwd_mean, next_rewards) all_loss = rwd_loss + model_loss.mean() # Update @@ -186,7 +191,7 @@ def pred_rewards( """ rewards = [] for model in self.models: - pred_rewards, _ = model.reward_network.forward(observation, actions) + pred_rewards = model.reward_network.forward(observation, actions) rewards.append(pred_rewards) # Use average rewards = torch.stack(rewards) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_world.py new file mode 100644 index 00000000..1e27d69d --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble_world.py @@ -0,0 +1,166 @@ +import logging +import math +import random +import 
sys + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim + +from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( + SimpleDynamics, +) +from cares_reinforcement_learning.networks.world_models.simple_rewards import ( + SimpleReward, +) +from cares_reinforcement_learning.util.helpers import normalize_observation_delta + + +class EnsembleWorldAndOneReward: + def __init__( + self, + observation_size: int, + num_actions: int, + num_models: int, + lr: float, + device: str, + hidden_size: int = 128, + ): + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + + self.reward_network = SimpleReward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) + + self.models = [ + SimpleDynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + for _ in range(self.num_models) + ] + + self.optimizers = [optim.Adam(self.models[i].parameters(), lr=lr) for i in range(self.num_models)] + + self.statistics = {} + + # Bring all reward prediction and dynamic rediction networks to device. + self.device = device + for model in self.models: + model.to(device) + + def set_statistics(self, statistics: dict) -> None: + """ + Update all statistics for normalization for all world models and the + ensemble itself. + + :param (Dictionary) statistics: + """ + for key, value in statistics.items(): + if isinstance(value, np.ndarray): + statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) + + self.statistics = statistics + for model in self.models: + model.statistics = statistics + + def pred_rewards(self, observation: torch.Tensor, actions: torch.Tensor): + pred_rewards = self.reward_network(observation, actions) + return pred_rewards + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + means = [] + norm_means = [] + norm_vars = [] + # Iterate over the neural networks and get the predictions + for model in self.models: + # Predict delta + mean, n_mean, n_var = model.forward(observation, actions) + means.append(mean) + norm_means.append(n_mean) + norm_vars.append(n_var) + # Normalized + predictions_means = torch.stack(means) + predictions_norm_means = torch.stack(norm_means) + predictions_vars = torch.stack(norm_vars) + # Get rid of the nans + not_nans = [] + for i in range(self.num_models): + if not torch.any(torch.isnan(predictions_means[i])): + not_nans.append(i) + if len(not_nans) == 0: + logging.info("Predicting all Nans") + sys.exit() + # Random Take next state. 
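+        # A non-NaN ensemble member is picked uniformly at random and its mean
+        # delta propagates the trajectory; every member's prediction is still
+        # returned so callers can estimate ensemble disagreement.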
+ rand_ind = random.randint(0, len(not_nans) - 1) + prediction = predictions_means[not_nans[rand_ind]] + # next = current + delta + prediction += observation + all_predictions = torch.stack(means) + for j in range(all_predictions.shape[0]): + all_predictions[j] += observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # For each model, train with different data. + mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) + + for i in range(self.num_models): + sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_target = sub_next_states - sub_states + + delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) + _, n_mean, n_var = self.models[i].forward(sub_states, sub_actions) + model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + + self.optimizers[i].zero_grad() + model_loss.backward() + self.optimizers[i].step() + + def train_reward( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + ) -> None: + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + self.reward_optimizer.zero_grad() + rwd_mean = self.reward_network.forward(states, actions) + reward_loss = F.mse_loss(rwd_mean, rewards) + reward_loss.backward() + self.reward_optimizer.step() + + diff --git a/cares_reinforcement_learning/networks/world_models/probability_rewards.py b/cares_reinforcement_learning/networks/world_models/probability_rewards.py new file mode 100644 index 00000000..3e53d5a3 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/probability_rewards.py @@ -0,0 +1,55 @@ +import torch +from torch import nn +import torch.nn.functional as F +from cares_reinforcement_learning.util.helpers import weight_init + + +class ProbabilityReward(nn.Module): + def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). + + :param (int) observation_size -- dimension of states + :param (int) num_actions -- dimension of actions + :param (int) hidden_size -- size of neurons in hidden layers. + """ + super().__init__() + self.observation_size = observation_size + self.num_actions = num_actions + self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + self.mean = nn.Linear(hidden_size, 1) + self.var = nn.Linear(hidden_size, 1) + self.apply(weight_init) + + def forward( + self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False + ) -> torch.Tensor: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. 
+ + :param (Tensors) obs -- dimension of states + :param (Tensors) actions -- dimension of actions + :param (Bool) normalized -- whether normalized reward to 0~1 + + :return (Tensors) x -- predicted rewards. + """ + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + x = torch.cat((observation, actions), dim=1) + x = self.linear1(x) + x = F.relu(x) + x = self.linear2(x) + x = F.relu(x) + rwd_mean = self.mean(x) + rwd_var = self.var(x) + logvar = torch.tanh(rwd_var) + rwd_var = torch.exp(logvar) + if normalized: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean, rwd_var diff --git a/cares_reinforcement_learning/networks/world_models/simple_rewards.py b/cares_reinforcement_learning/networks/world_models/simple_rewards.py index ae9adcfc..879b71b0 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_rewards.py +++ b/cares_reinforcement_learning/networks/world_models/simple_rewards.py @@ -20,8 +20,7 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): self.num_actions = num_actions self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.mean = nn.Linear(hidden_size, 1) - self.var = nn.Linear(hidden_size, 1) + self.linear3 = nn.Linear(hidden_size, 1) self.apply(weight_init) def forward( @@ -46,10 +45,7 @@ def forward( x = F.relu(x) x = self.linear2(x) x = F.relu(x) - rwd_mean = self.mean(x) - rwd_var = self.var(x) - logvar = torch.tanh(rwd_var) - rwd_var = torch.exp(logvar) + rwd_mean = self.linear3(x) if normalized: rwd_mean = F.sigmoid(rwd_mean) - return rwd_mean, rwd_var + return rwd_mean diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 8f33d615..6e88c4a3 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -86,13 +86,14 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): from cares_reinforcement_learning.algorithm.mbrl import DynaSAC from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldReward( + world_model = EnsembleWorldAndOneReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -126,13 +127,14 @@ def create_DynaSAC_Reweight(observation_size, action_num, config: AlgorithmConfi from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Reweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldReward( + world_model = EnsembleWorldAndOneReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, From 137c5f6a660403acc7d57757a7d19202680dda44 Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 24 Apr 2024 19:42:40 +1200 
Subject: [PATCH 18/91] reward network to device --- .../networks/world_models/ensemble_world.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_world.py index 1e27d69d..42a30079 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_world.py @@ -54,6 +54,7 @@ def __init__( # Bring all reward prediction and dynamic rediction networks to device. self.device = device + self.reward_network.to(self.device) for model in self.models: model.to(device) From aa3f958a41e61cd15841f06107271d2a57fc19a8 Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 29 Apr 2024 10:35:21 +1200 Subject: [PATCH 19/91] reward network to device --- .../algorithm/mbrl/DynaSAC.py | 2 +- .../algorithm/mbrl/DynaSAC_Reweight.py | 54 +++++++++++++++++-- .../networks/world_models/ensemble_world.py | 12 ++--- .../networks/world_models/simple_rewards.py | 16 +++--- 4 files changed, 67 insertions(+), 17 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index 86dd20f7..7c0fdf32 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -243,7 +243,7 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_next_state, _, _, _ = self.world_model.pred_next_states( pred_state, pred_acts ) - pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) + pred_reward = self.world_model.pred_rewards(pred_next_state) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index 28ef992a..1f39c8d5 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -204,7 +204,7 @@ def train_world_model( next_states=next_states, ) self.world_model.train_reward( - states=states, + next_states=next_states, actions=actions, rewards=rewards, ) @@ -255,12 +255,12 @@ def _dyna_generate_and_train(self, next_states): pred_state, pred_acts ) - uncert = sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) uncert = 1.5 - uncert uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) - pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) + pred_reward = self.world_model.pred_rewards(pred_next_state) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) @@ -279,6 +279,54 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) + def sampling(self, pred_means, pred_vars): + """ + High std means low uncertainty. 
Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + # 5 models, each sampled 10 times = 50, + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [10]) + rwd_sample1 = self.world_model.pred_rewards(sample1) + + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [10]) + rwd_sample2 = self.world_model.pred_rewards(sample2) + + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [10]) + rwd_sample3 = self.world_model.pred_rewards(sample3) + + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [10]) + rwd_sample4 = self.world_model.pred_rewards(sample4) + + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [10]) + rwd_sample5 = self.world_model.pred_rewards(sample5) + + samples = torch.cat((rwd_sample1, rwd_sample2, rwd_sample3, rwd_sample4, rwd_sample5)) + # samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) + # Samples = [5 * 10, 10 predictions, 11 state dims] + # print(samples.shape) + stds = torch.var(samples, dim=0) + # print(stds.shape) + # [10 predictions, 11 state dims] + total_stds = torch.mean(stds, dim=1) + # Clip for sigmoid + total_stds[total_stds < 0.2] = 0.0 + total_stds[total_stds > 4.0] = 4.0 + + total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 + # total_stds = 1 / total_stds + # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, + # high std, encouraged. + # total_stds = total_stds - torch.min(total_stds) + return total_stds.detach() + def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_world.py index 42a30079..58c1c13a 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_world.py @@ -73,8 +73,8 @@ def set_statistics(self, statistics: dict) -> None: for model in self.models: model.statistics = statistics - def pred_rewards(self, observation: torch.Tensor, actions: torch.Tensor): - pred_rewards = self.reward_network(observation, actions) + def pred_rewards(self, observation: torch.Tensor): + pred_rewards = self.reward_network(observation) return pred_rewards def pred_next_states( @@ -148,18 +148,18 @@ def train_world( def train_reward( self, - states: torch.Tensor, + next_states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor, ) -> None: - assert len(states.shape) >= 2 + assert len(next_states.shape) >= 2 assert len(actions.shape) == 2 assert ( - states.shape[1] + actions.shape[1] + next_states.shape[1] + actions.shape[1] == self.num_actions + self.observation_size ) self.reward_optimizer.zero_grad() - rwd_mean = self.reward_network.forward(states, actions) + rwd_mean = self.reward_network.forward(next_states) reward_loss = F.mse_loss(rwd_mean, rewards) reward_loss.backward() self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/simple_rewards.py b/cares_reinforcement_learning/networks/world_models/simple_rewards.py index 879b71b0..d385de12 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_rewards.py +++ b/cares_reinforcement_learning/networks/world_models/simple_rewards.py @@ -18,13 +18,14 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): super().__init__() self.observation_size = observation_size self.num_actions = 
num_actions - self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) + self.linear1 = nn.Linear(observation_size, hidden_size) + # self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear3 = nn.Linear(hidden_size, 1) self.apply(weight_init) def forward( - self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False + self, observation: torch.Tensor, normalized: bool = False ) -> torch.Tensor: """ Forward the inputs throught the network. @@ -36,11 +37,12 @@ def forward( :return (Tensors) x -- predicted rewards. """ - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) - x = torch.cat((observation, actions), dim=1) + # assert ( + # observation.shape[1] + actions.shape[1] + # == self.observation_size + self.num_actions + # ) + # x = torch.cat((observation, actions), dim=1) + x = observation x = self.linear1(x) x = F.relu(x) x = self.linear2(x) From b1ed6da4dbb44d0fb0366808b3f0572130fb807e Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 29 Apr 2024 13:00:52 +1200 Subject: [PATCH 20/91] reward network to device --- .../algorithm/mbrl/DynaSAC.py | 2 +- .../algorithm/mbrl/DynaSAC_Reweight.py | 27 ++++++++++++++++--- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index 7c0fdf32..11343bce 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -198,7 +198,7 @@ def train_world_model( next_states=next_states, ) self.world_model.train_reward( - states=states, + next_states=next_states, actions=actions, rewards=rewards, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index 1f39c8d5..490d0fe4 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -117,8 +117,27 @@ def _train_policy( q_target = rewards + self.gamma * (1 - dones) * target_q_values q_values_one, q_values_two = self.critic_net(states, actions) - critic_loss_one = 0.5 * (weights * (q_values_one - q_target).pow(2)).mean() - critic_loss_two = 0.5 * (weights * (q_values_two - q_target).pow(2)).mean() + + l1_loss_one = q_values_one - q_target + l1_loss_two = q_values_two - q_target + + reweighted_l1_loss_one = weights * l1_loss_one + reweighted_l1_loss_two = weights * l1_loss_two + + total_loss_after_one = torch.sum(reweighted_l1_loss_one) + total_loss_after_two = torch.sum(reweighted_l1_loss_two) + + total_loss_one = torch.sum(l1_loss_one) + total_loss_two = torch.sum(l1_loss_two) + + ratio_one = total_loss_one / total_loss_after_one + ratio_two = total_loss_two / total_loss_after_two + + reweighted_l1_loss_one = reweighted_l1_loss_one * ratio_one + reweighted_l1_loss_two = reweighted_l1_loss_two * ratio_two + + critic_loss_one = 0.5 * (reweighted_l1_loss_one.pow(2)).mean() + critic_loss_two = 0.5 * (reweighted_l1_loss_two.pow(2)).mean() # critic_loss_one = F.mse_loss(q_values_one, q_target) # critic_loss_two = F.mse_loss(q_values_two, q_target) @@ -317,8 +336,8 @@ def sampling(self, pred_means, pred_vars): # [10 predictions, 11 state dims] total_stds = torch.mean(stds, dim=1) # Clip for sigmoid - total_stds[total_stds < 0.2] = 0.0 - total_stds[total_stds > 4.0] = 4.0 + # total_stds[total_stds < 0.2] 
= 0.0 + # total_stds[total_stds > 4.0] = 4.0 total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 # total_stds = 1 / total_stds From 6727cf15e2cd1715b5bab0ce8b1ae9758a3fe01e Mon Sep 17 00:00:00 2001 From: tony Date: Tue, 30 Apr 2024 11:12:55 +1200 Subject: [PATCH 21/91] test with simple R. --- .../algorithm/mbrl/DynaSAC.py | 1 - .../algorithm/mbrl/DynaSAC_Reweight.py | 2 -- .../networks/world_models/ensemble_world.py | 11 +++++------ .../util/uncertainty_estimation.py | 4 ++-- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index 11343bce..82ec4c12 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -199,7 +199,6 @@ def train_world_model( ) self.world_model.train_reward( next_states=next_states, - actions=actions, rewards=rewards, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index 490d0fe4..e97bc43a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -126,7 +126,6 @@ def _train_policy( total_loss_after_one = torch.sum(reweighted_l1_loss_one) total_loss_after_two = torch.sum(reweighted_l1_loss_two) - total_loss_one = torch.sum(l1_loss_one) total_loss_two = torch.sum(l1_loss_two) @@ -224,7 +223,6 @@ def train_world_model( ) self.world_model.train_reward( next_states=next_states, - actions=actions, rewards=rewards, ) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_world.py index 58c1c13a..570b0659 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_world.py @@ -149,15 +149,14 @@ def train_world( def train_reward( self, next_states: torch.Tensor, - actions: torch.Tensor, rewards: torch.Tensor, ) -> None: assert len(next_states.shape) >= 2 - assert len(actions.shape) == 2 - assert ( - next_states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size - ) + # assert len(actions.shape) == 2 + # assert ( + # next_states.shape[1] + actions.shape[1] + # == self.num_actions + self.observation_size + # ) self.reward_optimizer.zero_grad() rwd_mean = self.reward_network.forward(next_states) reward_loss = F.mse_loss(rwd_mean, rewards) diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py index 1992373d..ed9b0933 100644 --- a/cares_reinforcement_learning/util/uncertainty_estimation.py +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -30,8 +30,8 @@ def sampling(pred_means, pred_vars): # [10 predictions, 11 state dims] total_stds = torch.mean(stds, dim=1) # Clip for sigmoid - total_stds[total_stds < 0.2] = 0.0 - total_stds[total_stds > 4.0] = 4.0 + # total_stds[total_stds < 0.2] = 0.0 + # total_stds[total_stds > 4.0] = 4.0 total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 # total_stds = 1 / total_stds From 437c8a8839aa9f196ef1dabbc84e67a165a86e7c Mon Sep 17 00:00:00 2001 From: tony Date: Tue, 30 Apr 2024 11:16:27 +1200 Subject: [PATCH 22/91] test with simple R. 
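
The critic goes back to a plain per-sample weighted MSE: the ratio-based
rescaling of the reweighted loss from the previous commit is removed. For
reference, a minimal standalone sketch of the loss that remains; the
function name and shapes are illustrative only, not part of the codebase:

    import torch

    def weighted_critic_loss(q_value: torch.Tensor,
                             q_target: torch.Tensor,
                             weight: torch.Tensor) -> torch.Tensor:
        # Per-sample squared TD error, scaled by an uncertainty weight that
        # has already been detached upstream.
        td_error = q_value - q_target  # [batch, 1]
        return 0.5 * (td_error.pow(2) * weight).mean()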
--- .../algorithm/mbrl/DynaSAC_Reweight.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index e97bc43a..c41245a6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -121,22 +121,8 @@ def _train_policy( l1_loss_one = q_values_one - q_target l1_loss_two = q_values_two - q_target - reweighted_l1_loss_one = weights * l1_loss_one - reweighted_l1_loss_two = weights * l1_loss_two - - total_loss_after_one = torch.sum(reweighted_l1_loss_one) - total_loss_after_two = torch.sum(reweighted_l1_loss_two) - total_loss_one = torch.sum(l1_loss_one) - total_loss_two = torch.sum(l1_loss_two) - - ratio_one = total_loss_one / total_loss_after_one - ratio_two = total_loss_two / total_loss_after_two - - reweighted_l1_loss_one = reweighted_l1_loss_one * ratio_one - reweighted_l1_loss_two = reweighted_l1_loss_two * ratio_two - - critic_loss_one = 0.5 * (reweighted_l1_loss_one.pow(2)).mean() - critic_loss_two = 0.5 * (reweighted_l1_loss_two.pow(2)).mean() + critic_loss_one = 0.5 * (l1_loss_one.pow(2) * weights).mean() + critic_loss_two = 0.5 * (l1_loss_two.pow(2) * weights).mean() # critic_loss_one = F.mse_loss(q_values_one, q_target) # critic_loss_two = F.mse_loss(q_values_two, q_target) From e55dc23b1003f2496150541f35345075b7ef8562 Mon Sep 17 00:00:00 2001 From: tony Date: Tue, 30 Apr 2024 11:58:39 +1200 Subject: [PATCH 23/91] test with simple R. --- .../algorithm/mbrl/DynaSAC.py | 50 +++---- .../algorithm/mbrl/DynaSAC_Reweight.py | 126 ++++++++---------- 2 files changed, 84 insertions(+), 92 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index 82ec4c12..eb177c2c 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -232,32 +232,32 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_rs = [] pred_n_states = [] - pred_state = next_states - - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, _, _ = self.world_model.pred_next_states( - pred_state, pred_acts + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! 
It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones ) - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones - ) def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py index c41245a6..cdad89c7 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py @@ -247,40 +247,41 @@ def _dyna_generate_and_train(self, next_states): pred_rs = [] pred_n_states = [] pred_uncerts = [] - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = 1.5 - uncert - uncert = uncert.unsqueeze(dim=1).to(self.device) - - pred_uncerts.append(uncert) - pred_reward = self.world_model.pred_rewards(pred_next_state) - - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! 
- pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = 1.5 - uncert + uncert = uncert.unsqueeze(dim=1).to(self.device) + + pred_uncerts.append(uncert) + pred_reward = self.world_model.pred_rewards(pred_next_state) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) def sampling(self, pred_means, pred_vars): """ @@ -290,40 +291,31 @@ def sampling(self, pred_means, pred_vars): :param pred_vars: :return: """ - # 5 models, each sampled 10 times = 50, - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [10]) - rwd_sample1 = self.world_model.pred_rewards(sample1) - - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [10]) - rwd_sample2 = self.world_model.pred_rewards(sample2) - - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [10]) - rwd_sample3 = self.world_model.pred_rewards(sample3) - - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [10]) - rwd_sample4 = self.world_model.pred_rewards(sample4) - - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [10]) - rwd_sample5 = self.world_model.pred_rewards(sample5) - - samples = torch.cat((rwd_sample1, rwd_sample2, rwd_sample3, rwd_sample4, rwd_sample5)) - # samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) - # Samples = [5 * 10, 10 predictions, 11 state dims] - # print(samples.shape) - stds = torch.var(samples, dim=0) - # print(stds.shape) - # [10 predictions, 11 state dims] - total_stds = torch.mean(stds, dim=1) - # Clip for sigmoid - # total_stds[total_stds < 0.2] = 0.0 - # total_stds[total_stds > 4.0] = 4.0 - - total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 + with torch.no_grad(): + # 5 models, each sampled 10 times = 50, + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [10]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [10]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [10]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [10]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [10]) + samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) + # samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) + # Samples = [5 * 10, 10 predictions, 11 state dims] + # print(samples.shape) + stds = torch.var(samples, dim=0) + # print(stds.shape) + # [10 predictions, 11 state dims] + total_stds = torch.mean(stds, dim=1) + # Clip for sigmoid + # total_stds[total_stds < 0.2] = 0.0 + # total_stds[total_stds > 4.0] = 4.0 + + total_stds = 
F.sigmoid(total_stds) # 0.5 - 1.0 # total_stds = 1 / total_stds # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, # high std, encouraged. From cae6ef4ca36b2b8508c332859861ef4318c7f36a Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 16 May 2024 16:59:20 +1200 Subject: [PATCH 24/91] Many algorithms. --- .../algorithm/mbrl/DynaSAC.py | 46 +- ...C_Reweight.py => DynaSAC_BatchReweight.py} | 170 ++++---- .../algorithm/mbrl/DynaSAC_Var.py | 363 ++++++++++++++++ .../algorithm/mbrl/DynaSAT.py | 251 +++++++++++ .../algorithm/mbrl/DynaSAT_BatchReweight.py | 394 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 5 +- .../networks/SAC/__init__.py | 1 + .../networks/SAC/triple_critic.py | 45 ++ .../util/configurations.py | 65 ++- .../util/network_factory.py | 139 +++++- 10 files changed, 1351 insertions(+), 128 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_Reweight.py => DynaSAC_BatchReweight.py} (71%) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py create mode 100644 cares_reinforcement_learning/networks/SAC/triple_critic.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index eb177c2c..d0c9a046 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -15,9 +15,7 @@ import torch.nn.functional as F from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( - EnsembleWorldReward, -) + from cares_reinforcement_learning.networks.world_models.ensemble_world import ( EnsembleWorldAndOneReward, ) @@ -105,6 +103,7 @@ def _train_policy( ################## Update the Critic First #################### with torch.no_grad(): next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( next_states, next_actions ) @@ -114,8 +113,10 @@ def _train_policy( q_target = rewards + self.gamma * (1 - dones) * target_q_values q_values_one, q_values_two = self.critic_net(states, actions) - critic_loss_one = F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two # Update the Critic @@ -158,39 +159,10 @@ def train_world_model( experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences - # experiences = memory.sample_consecutive(batch_size) - # ( - # states, - # actions, - # rewards, - # next_states, - # _, - # _, - # next_actions, - # next_rewards, - # _, - # _, - # _, - # ) = experiences - states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - # next_rewards = ( - # torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) - # ) - # next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) - - # Step 1 train the world model. 
- # self.world_model.train_world( - # states=states, - # actions=actions, - # rewards=rewards, - # next_states=next_states, - # next_actions=next_actions, - # next_rewards=next_rewards, - # ) self.world_model.train_world( states=states, @@ -255,9 +227,9 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: # Pay attention to here! It is dones in the Cares RL Code! pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones - ) + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + ) def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py similarity index 71% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py index cdad89c7..99ed1334 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py @@ -12,20 +12,14 @@ import numpy as np import torch -import torch.nn.functional as F - from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.util import sampling -from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( - EnsembleWorldReward, -) from cares_reinforcement_learning.networks.world_models.ensemble_world import ( EnsembleWorldAndOneReward, ) -class DynaSAC_Reweight: +class DynaSAC_BatchReweight: def __init__( self, actor_network: torch.nn.Module, @@ -106,6 +100,7 @@ def _train_policy( weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### + # Have more target values? with torch.no_grad(): next_actions, next_log_pi, _ = self.actor_net(next_states) target_q_one, target_q_two = self.target_critic_net( @@ -118,14 +113,24 @@ def _train_policy( q_values_one, q_values_two = self.critic_net(states, actions) - l1_loss_one = q_values_one - q_target - l1_loss_two = q_values_two - q_target + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
- critic_loss_one = 0.5 * (l1_loss_one.pow(2) * weights).mean() - critic_loss_two = 0.5 * (l1_loss_two.pow(2) * weights).mean() + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - # critic_loss_one = F.mse_loss(q_values_one, q_target) - # critic_loss_two = F.mse_loss(q_values_two, q_target) critic_loss_total = critic_loss_one + critic_loss_two # Update the Critic @@ -167,40 +172,10 @@ def train_world_model( experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences - # experiences = memory.sample_consecutive(batch_size) - # - # ( - # states, - # actions, - # rewards, - # next_states, - # _, - # _, - # next_actions, - # next_rewards, - # _, - # _, - # _, - # ) = experiences - states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - # next_rewards = ( - # torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) - # ) - # next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) - - # # Step 1 train the world model. - # self.world_model.train_world( - # states=states, - # actions=actions, - # rewards=rewards, - # next_states=next_states, - # next_actions=next_actions, - # next_rewards=next_rewards, - # ) self.world_model.train_world( states=states, @@ -258,14 +233,11 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = 1.5 - uncert uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_reward = self.world_model.pred_rewards(pred_next_state) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) @@ -279,11 +251,11 @@ def _dyna_generate_and_train(self, next_states): # Pay attention to here! It is dones in the Cares RL Code! pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) - def sampling(self, pred_means, pred_vars): + def sampling(self, pred_means, pred_vars, phi=0.0001): """ High std means low uncertainty. Therefore, divided by 1 @@ -291,35 +263,85 @@ def sampling(self, pred_means, pred_vars): :param pred_vars: :return: """ + sample_times = 10 with torch.no_grad(): - # 5 models, each sampled 10 times = 50, + # 5 models. Each predict 10 next_states. 
sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [10]) + [sample_times]) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [10]) + [sample_times]) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [10]) + [sample_times]) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [10]) + [sample_times]) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [10]) - samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) - # samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) - # Samples = [5 * 10, 10 predictions, 11 state dims] - # print(samples.shape) - stds = torch.var(samples, dim=0) - # print(stds.shape) - # [10 predictions, 11 state dims] - total_stds = torch.mean(stds, dim=1) - # Clip for sigmoid - # total_stds[total_stds < 0.2] = 0.0 - # total_stds[total_stds > 4.0] = 4.0 - - total_stds = F.sigmoid(total_stds) # 0.5 - 1.0 - # total_stds = 1 / total_stds - # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, - # high std, encouraged. - # total_stds = total_stds - torch.min(total_stds) + [sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. 
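+            # Sample estimate of Cov(log_pi, Q) across the ensemble draws; the
+            # total below uses Var(r + log_pi + Q) = Var(r) + Var(log_pi)
+            # + Var(Q) + 2*Cov(log_pi, Q), treating the reward term as
+            # independent of the action and value terms.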
+ mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + total_var = var_r + var_a + var_q + 2 * cov_aq + total_var[total_var < phi] = phi + total_stds = 1 / total_var return total_stds.detach() def set_statistics(self, stats: dict) -> None: diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py new file mode 100644 index 00000000..d53859e7 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py @@ -0,0 +1,363 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_Var: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? 
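+        # For imagined transitions the incoming weights are the raw variance
+        # estimates from sampling() (all ones for real batches); they are
+        # detached, and the weighted critic loss is rescaled so its magnitude
+        # matches the unweighted MSE while per-sample gradients keep the
+        # weighting.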
+ with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. + + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + 
actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars, phi=0.0001): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + sample_times = 10 + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. 
+ # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + total_var = var_r + var_a + var_q + 2 * cov_aq + total_var[total_var < phi] = phi + # total_stds = 1 / total_var + return total_var.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py new file mode 100644 index 00000000..f67d30f7 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py @@ -0,0 +1,251 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
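+
+DynaSAT: a Dyna-style SAC variant that uses a triple-Q critic (TriCritic);
+critic targets take the minimum over the three Q-heads.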
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch + +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAT: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two, target_q_three = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(torch.minimum(target_q_one, target_q_two), target_q_three) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two, q_values_three = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_three = ((q_values_three - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + critic_loss_two + critic_loss_three + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the 
Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi, qf3_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(torch.minimum(qf1_pi, qf2_pi), qf3_pi) + + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! 
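+        # Imagined transitions are treated as non-terminal: the zeros below mean
+        # "not done", matching the (1 - dones) factor in the critic target.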
+ pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + ) + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py new file mode 100644 index 00000000..548bc8b9 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py @@ -0,0 +1,394 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAT_BatchReweight: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
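+        # log(alpha) is optimised rather than alpha itself so that alpha stays positive;
+        # target_entropy = -action_num is the usual SAC heuristic for continuous actions.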
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two, target_q_three = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(torch.minimum(target_q_one, target_q_two), target_q_three) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two, q_values_three = self.critic_net(states, actions) + + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + l2_loss_three = (q_values_three - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + disc_l2_loss_three = l2_loss_three * weights + + # A ratio to scale the loss back to original loss scale. 
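+        # The ratio restores the mean critic loss to its unweighted magnitude, so the
+        # weights change only the relative emphasis of samples, not the effective step size.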
+ ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + ratio_3 = torch.mean(l2_loss_three) / torch.mean(disc_l2_loss_three) + ratio_3 = ratio_3.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + critic_loss_three = disc_l2_loss_three.mean() * ratio_3 + + critic_loss_total = critic_loss_one + critic_loss_two + critic_loss_three + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi, qf3_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(torch.minimum(qf1_pi, qf2_pi), qf3_pi) + + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. 
But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + # uncert = torch.ones(pred_reward.shape).to(self.device) + pred_uncerts.append(uncert) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars, phi=0.0001): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + sample_times = 10 + with torch.no_grad(): + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [sample_times]) + + rs = [] + acts = [] + qs = [] + q_vars = [] + q_means = [] + # Varying the next_state's distribution. + for i in range(sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + # Same sample, different model same next_state. + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. 
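+                # Q-uncertainty below follows a total-variance decomposition over the three
+                # critic heads: variance of the per-sample head means plus the mean of the
+                # per-sample head variances.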
+ # 5 models * 10 samples [var of state] + qa1, qa2, qa3 = self.target_critic_net(sample1[i], pred_act1) + qa_mean = (qa1 + qa2 + qa3) / 3.0 + qa_var = ((qa1 - qa_mean).pow(2) + (qa2 - qa_mean).pow(2) + (qa3 - qa_mean).pow(2)) / 3.0 + q_vars.append(qa_var) + q_means.append(qa_mean) + # qa_mins = torch.minimum(torch.minimum(qa1, qa2), qa3) + # qs.append(qa_mins) + + qb1, qb2, qb3 = self.target_critic_net(sample2[i], pred_act2) + qb_mean = (qb1 + qb2 + qb3) / 3.0 + qb_var = ((qb1 - qb_mean).pow(2) + (qb2 - qb_mean).pow(2) + (qb3 - qb_mean).pow(2)) / 3.0 + q_vars.append(qb_var) + q_means.append(qb_mean) + # qb_mins = torch.minimum(torch.minimum(qb1, qb2), qb3) + # qs.append(qb_mins) + + qc1, qc2, qc3 = self.target_critic_net(sample3[i], pred_act3) + qc_mean = (qc1 + qc2 + qc3) / 3.0 + qc_var = ((qc1 - qc_mean).pow(2) + (qc2 - qc_mean).pow(2) + (qc3 - qc_mean).pow(2)) / 3.0 + q_vars.append(qc_var) + q_means.append(qc_mean) + # qc_mins = torch.minimum(torch.minimum(qc1, qc2), qc3) + # qs.append(qc_mins) + + qd1, qd2, qd3 = self.target_critic_net(sample4[i], pred_act4) + qd_mean = (qd1 + qd2 + qd3) / 3.0 + qd_var = ((qd1 - qd_mean).pow(2) + (qd2 - qd_mean).pow(2) + (qd3 - qd_mean).pow(2)) / 3.0 + q_vars.append(qd_var) + q_means.append(qd_mean) + # qd_mins = torch.minimum(torch.minimum(qd1, qd2), qd3) + # qs.append(qd_mins) + + qe1, qe2, qe3 = self.target_critic_net(sample5[i], pred_act5) + qe_mean = (qe1 + qe2 + qe3) / 3.0 + qe_var = ((qe1 - qe_mean).pow(2) + (qe2 - qe_mean).pow(2) + (qe3 - qe_mean).pow(2)) / 3.0 + q_vars.append(qe_var) + q_means.append(qe_mean) + # qe_mins = torch.minimum(torch.minimum(qe1, qe2), qe3) + # qs.append(qe_mins) + + rs = torch.stack(rs) + acts = torch.stack(acts) + # qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + + q_vars = torch.stack(q_vars) + q_means = torch.stack(q_means) + var_of_mean = torch.var(q_means, dim=0) + mean_of_vars = torch.mean(q_vars, dim=0) + var_q = var_of_mean + mean_of_vars + + # Computing covariance. 
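+            # The covariance terms are left disabled in this variant; the weight is simply
+            # 1 / total_var (floored at phi), so lower-variance rollouts get larger weights.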
+ # mean_a = torch.mean(acts, dim=0, keepdim=True) + # mean_q = torch.mean(qs, dim=0, keepdim=True) + # diff_a = acts - mean_a + # diff_q = qs - mean_q + # cov_aq = torch.mean(diff_a * diff_q, dim=0) + + total_var = var_r + var_a + var_q # + 2 * cov_aq + # Clip for sigmoid + total_var[total_var < phi] = phi + total_stds = 1 / total_var + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 87c0c7f8..663387a8 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,2 +1,5 @@ from .DynaSAC import DynaSAC -from .DynaSAC_Reweight import DynaSAC_Reweight +from .DynaSAC_BatchReweight import DynaSAC_BatchReweight +from .DynaSAC_Var import DynaSAC_Var +from .DynaSAT import DynaSAT +from .DynaSAT_BatchReweight import DynaSAT_BatchReweight diff --git a/cares_reinforcement_learning/networks/SAC/__init__.py b/cares_reinforcement_learning/networks/SAC/__init__.py index 76c75cd1..5a1eb50d 100644 --- a/cares_reinforcement_learning/networks/SAC/__init__.py +++ b/cares_reinforcement_learning/networks/SAC/__init__.py @@ -1,2 +1,3 @@ from .actor import Actor from .critic import Critic +from .triple_critic import TriCritic diff --git a/cares_reinforcement_learning/networks/SAC/triple_critic.py b/cares_reinforcement_learning/networks/SAC/triple_critic.py new file mode 100644 index 00000000..c6347d6a --- /dev/null +++ b/cares_reinforcement_learning/networks/SAC/triple_critic.py @@ -0,0 +1,45 @@ +import torch +from torch import nn + + +class TriCritic(nn.Module): + def __init__(self, observation_size: int, num_actions: int): + super().__init__() + + self.hidden_size = [256, 256] + + # Q1 architecture + self.Q1 = nn.Sequential( + nn.Linear(observation_size + num_actions, self.hidden_size[0]), + nn.ReLU(), + nn.Linear(self.hidden_size[0], self.hidden_size[1]), + nn.ReLU(), + nn.Linear(self.hidden_size[1], 1), + ) + + # Q2 architecture + self.Q2 = nn.Sequential( + nn.Linear(observation_size + num_actions, self.hidden_size[0]), + nn.ReLU(), + nn.Linear(self.hidden_size[0], self.hidden_size[1]), + nn.ReLU(), + nn.Linear(self.hidden_size[1], 1), + ) + + self.Q3 = nn.Sequential( + nn.Linear(observation_size + num_actions, self.hidden_size[0]), + nn.ReLU(), + nn.Linear(self.hidden_size[0], self.hidden_size[1]), + nn.ReLU(), + nn.Linear(self.hidden_size[1], 1), + ) + + + def forward( + self, state: torch.Tensor, action: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + obs_action = torch.cat([state, action], dim=1) + q1 = self.Q1(obs_action) + q2 = self.Q2(obs_action) + q3 = self.Q3(obs_action) 
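+        # Three independent Q-heads over the same state-action input; callers take the
+        # element-wise minimum or the mean/variance across heads downstream.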
+ return q1, q2, q3 diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 31a1d38a..f7e76629 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -133,14 +133,31 @@ class SACConfig(AlgorithmConfig): actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 alpha_lr: Optional[float] = 3e-4 + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + +class DynaSAC_VarConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Var", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 gamma: Optional[float] = 0.99 tau: Optional[float] = 0.005 reward_scale: Optional[float] = 1.0 + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + -class DynaSAC_ReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_Reweight", Literal=True) +class DynaSAT_BatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAT_BatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -150,8 +167,45 @@ class DynaSAC_ReweightConfig(AlgorithmConfig): gamma: Optional[float] = 0.99 tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 - horizon: Optional[int] = 3 + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + +class DynaSAC_BatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_BatchReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + +class DynaSATConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAT", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 @@ -162,15 +216,14 @@ class DynaSACConfig(AlgorithmConfig): critic_lr: Optional[float] = 3e-4 alpha_lr: Optional[float] = 3e-4 - - # TODO this bool doesn't work as expected - needs to be int 1/0 use_bounded_active: Optional[bool] = False num_models: Optional[int] = 5 gamma: Optional[float] = 0.99 tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 - horizon: Optional[int] = 3 + horizon: Optional[int] = 1 num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 79a27f58..af815560 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -77,19 +77,18 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent -def create_DynaSAC(observation_size, action_num, config: 
AlgorithmConfig): +def create_DynaSAT_BatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward + from cares_reinforcement_learning.algorithm.mbrl import DynaSAT_BatchReweight + from cares_reinforcement_learning.networks.SAC import Actor, TriCritic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) + critic = TriCritic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -97,11 +96,51 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): observation_size=observation_size, num_actions=action_num, num_models=config.num_models, + device=device, lr=config.world_model_lr, + ) + + agent = DynaSAT_BatchReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, ) + return agent - agent = DynaSAC( + +def create_DynaSAT(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAT + from cares_reinforcement_learning.networks.SAC import Actor, TriCritic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = TriCritic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + lr=config.world_model_lr, + device=device, + ) + + agent = DynaSAT( actor_network=actor, critic_network=critic, world_network=world_model, @@ -118,15 +157,54 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): return agent -def create_DynaSAC_Reweight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_BatchReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
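+    Imagined transitions are reweighted by the inverse of their estimated variance.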
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BatchReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_BatchReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + ) + return agent + + +def create_DynaSAC_Var(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Reweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Var from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward actor = Actor(observation_size, action_num) @@ -142,7 +220,7 @@ def create_DynaSAC_Reweight(observation_size, action_num, config: AlgorithmConfi lr=config.world_model_lr, ) - agent = DynaSAC_Reweight( + agent = DynaSAC_Var( actor_network=actor, critic_network=critic, world_network=world_model, @@ -159,6 +237,47 @@ def create_DynaSAC_Reweight(observation_size, action_num, config: AlgorithmConfi return agent +def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + lr=config.world_model_lr, + device=device, + ) + + agent = DynaSAC( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + device=device, + ) + return agent + + + def create_SAC(observation_size, action_num, config: AlgorithmConfig): from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic From 3521a289249882441c2e3b37465a445e7dd28bcb Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 16 May 2024 18:49:45 +1200 Subject: [PATCH 25/91] full variance. 
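
Treat the per-sample uncertainty as the full variance of r + log_pi + Q:

    Var(r + log_pi + Q) = Var(r) + Var(log_pi) + Var(Q)
                          + 2*Cov(log_pi, Q) + 2*Cov(r, Q) + 2*Cov(r, log_pi)

The reward covariance terms (cov_rq, cov_ra) were previously omitted.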
--- .../algorithm/mbrl/DynaSAC_BatchReweight.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py index 99ed1334..b2cbcd08 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py @@ -339,7 +339,12 @@ def sampling(self, pred_means, pred_vars, phi=0.0001): diff_q = qs - mean_q cov_aq = torch.mean(diff_a * diff_q, dim=0) - total_var = var_r + var_a + var_q + 2 * cov_aq + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra total_var[total_var < phi] = phi total_stds = 1 / total_var return total_stds.detach() From c2688973a38e5c18c1b5efd4483c55d5e35cc420 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 17 May 2024 11:56:17 +1200 Subject: [PATCH 26/91] Exacerbate the variance difference. --- .../mbrl/DynaSAC_ExaBatchReweight.py | 377 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 1 + .../util/configurations.py | 18 + .../util/network_factory.py | 40 ++ 4 files changed, 436 insertions(+) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py new file mode 100644 index 00000000..a5fc7c87 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py @@ -0,0 +1,377 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_ExaBatchReweight: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. 
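+                # Uniform actions in [-1, 1] match the tanh-squashed range assumed for the
+                # SAC actor; sampling from the current policy is the obvious alternative.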
+ rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars, phi=0.0001): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + sample_times = 10 + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. 
+ # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + + # Exacerbate the sample difference. + scale = 2.0 + mean_var = torch.mean(total_var, keepdim=True) + diff = total_var - mean_var + scale_diff = scale * diff + total_var += scale_diff + total_var[total_var < phi] = phi + + total_stds = 1 / total_var + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 663387a8..1b68582b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -3,3 +3,4 @@ from .DynaSAC_Var import DynaSAC_Var from .DynaSAT import DynaSAT from .DynaSAT_BatchReweight import DynaSAT_BatchReweight +from .DynaSAC_ExaBatchReweight import DynaSAC_ExaBatchReweight \ No newline at end of file diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index f7e76629..a06e0671 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -174,6 +174,24 @@ class DynaSAT_BatchReweightConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 +class DynaSAC_ExaBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_ExaBatchReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: 
Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + class DynaSAC_BatchReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_BatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index af815560..5a27352f 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -77,6 +77,46 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent +def create_DynaSAC_ExaBatchReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_ExaBatchReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_ExaBatchReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + ) + return agent + + def create_DynaSAT_BatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. From f68c9c4ee9d8551c90a3140866ecf04cbc5239c0 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 17 May 2024 12:00:51 +1200 Subject: [PATCH 27/91] Exacerbate the variance difference. --- .../algorithm/mbrl/DynaSAC_ExaBatchReweight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py index a5fc7c87..02bc5a70 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py @@ -348,7 +348,7 @@ def sampling(self, pred_means, pred_vars, phi=0.0001): # Exacerbate the sample difference. scale = 2.0 - mean_var = torch.mean(total_var, keepdim=True) + mean_var = torch.mean(total_var) diff = total_var - mean_var scale_diff = scale * diff total_var += scale_diff From e83c123a2d562ee883ddaf620efa2d691293dab9 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 17 May 2024 20:12:09 +1200 Subject: [PATCH 28/91] Maximize the variance rescale. 
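
Add DynaSAC_MaxBatchReweight together with its configuration and
network-factory entries: another batch-reweighted DynaSAC variant that,
per the patch title, pushes the variance-based rescaling further.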
--- .../mbrl/DynaSAC_MaxBatchReweight.py | 375 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 3 +- .../util/configurations.py | 16 + .../util/network_factory.py | 38 ++ 4 files changed, 431 insertions(+), 1 deletion(-) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py new file mode 100644 index 00000000..889efc44 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py @@ -0,0 +1,375 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_MaxBatchReweight: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? 
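+        # Standard clipped double-Q SAC target: minimum over the twin target critics
+        # minus the entropy term, bootstrapped only for non-terminal transitions.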
+ with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. + + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + 
actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars, phi=0.0001): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + sample_times = 10 + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. 
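                # Note: despite the name, the `acts` list filled below collects
                # the policy log-probabilities (log_pi*), not the sampled
                # actions, so it is the spread of the entropy term that enters
                # the variance estimate.
                # The [2560, 17] shape noted below presumably corresponds to a
                # 256-sample batch repeated num_samples = 10 times with a
                # 17-dimensional action space; the exact sizes depend on the
                # task and batch size.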
+ # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + + # Exacerbate the sample difference. 
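# `total_var` above is just Var(r + log_pi + Q) expanded into its variance and
# pairwise-covariance terms. A quick standalone check of that identity with
# illustrative shapes (50 draws over 256 imagined samples); note that in the
# method itself torch.var uses the unbiased (n - 1) estimator while the
# covariances divide by n, so the correspondence there is approximate rather
# than exact.
import torch

torch.manual_seed(0)
r, a, q = torch.randn(3, 50, 256)   # stand-ins for the stacked rs, acts, qs
kw = dict(dim=0, unbiased=False)
direct = torch.var(r + a + q, **kw)
expanded = (
    torch.var(r, **kw) + torch.var(a, **kw) + torch.var(q, **kw)
    + 2 * ((a - a.mean(0)) * (q - q.mean(0))).mean(0)
    + 2 * ((r - r.mean(0)) * (q - q.mean(0))).mean(0)
    + 2 * ((r - r.mean(0)) * (a - a.mean(0))).mean(0)
)
print(torch.allclose(direct, expanded, atol=1e-4))  # True
# (The min-max rescale that follows then turns these variances into weights.)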
+ min_var = torch.min(total_var) + max_var = torch.max(total_var) + scale_var = max_var - min_var + total_var = (total_var - min_var) / scale_var + total_var[total_var < phi] = phi + total_stds = 1 / total_var + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 1b68582b..b7e3e5c9 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -3,4 +3,5 @@ from .DynaSAC_Var import DynaSAC_Var from .DynaSAT import DynaSAT from .DynaSAT_BatchReweight import DynaSAT_BatchReweight -from .DynaSAC_ExaBatchReweight import DynaSAC_ExaBatchReweight \ No newline at end of file +from .DynaSAC_ExaBatchReweight import DynaSAC_ExaBatchReweight +from .DynaSAC_MaxBatchReweight import DynaSAC_MaxBatchReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index a06e0671..e0f5e9e2 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -173,6 +173,22 @@ class DynaSAT_BatchReweightConfig(AlgorithmConfig): num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 +class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_MaxBatchReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 class DynaSAC_ExaBatchReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_ExaBatchReweight", Literal=True) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 5a27352f..ccc349cf 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -76,6 +76,44 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): ) return agent +def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_MaxBatchReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_MaxBatchReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + ) + return agent def create_DynaSAC_ExaBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ From 05a56b2f4a77d228f98f742dc4333a84db64d209 Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 23 May 2024 12:48:39 +1200 Subject: [PATCH 29/91] Ablation exp --- .../mbrl/DynaSAC_MaxBatchReweight.py | 71 ++++++++++++------- .../util/configurations.py | 7 ++ .../util/network_factory.py | 4 ++ 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py index 889efc44..657382df 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py @@ -20,6 +20,9 @@ class DynaSAC_MaxBatchReweight: + """ + Max as ? + """ def __init__( self, actor_network: torch.nn.Module, @@ -33,6 +36,10 @@ def __init__( alpha_lr: float, num_samples: int, horizon: int, + max_scale: float, + max_threshold: float, + mode: int, + sample_times: int, device: torch.device, ): self.type = "mbrl" @@ -69,6 +76,11 @@ def __init__( # World model self.world_model = world_network + # Parameter + self.max_scale = max_scale + self.max_threshold = max_threshold + self.mode = mode + self.sample_times = sample_times @property def _alpha(self) -> float: @@ -255,7 +267,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars, phi=0.0001): + def sampling(self, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -263,24 +275,23 @@ def sampling(self, pred_means, pred_vars, phi=0.0001): :param pred_vars: :return: """ - sample_times = 10 with torch.no_grad(): # 5 models. Each predict 10 next_states. sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [sample_times]) + [self.sample_times]) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [sample_times]) + [self.sample_times]) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [sample_times]) + [self.sample_times]) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [sample_times]) + [self.sample_times]) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [sample_times]) + [self.sample_times]) rs = [] acts = [] qs = [] # Varying the next_state's distribution. 
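# For reference, the `mode` switch introduced later in this patch keeps only a
# subset of the variance expansion when forming `total_var` (names as in
# sampling(); the dict itself is illustrative and is not used by the code):
MODE_TERMS = {
    0: "var_r + var_a + var_q + 2*cov_aq + 2*cov_rq + 2*cov_ra",  # full expansion
    1: "var_r + var_a + var_q + 2*cov_aq",                        # drop reward covariances
    2: "var_r + var_a + var_q",                                   # variances only
    3: "var_r",                                                   # reward variance only
}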
- for i in range(sample_times): + for i in range(self.sample_times): # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1[i]) pred_rwd2 = self.world_model.pred_rewards(sample2[i]) @@ -329,31 +340,43 @@ def sampling(self, pred_means, pred_vars, phi=0.0001): qs = torch.stack(qs) var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - # Computing covariance. - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) - total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + # Ablation + if self.mode == 0: + total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + if self.mode == 1: + total_var = var_r + var_a + var_q + 2 * cov_aq + if self.mode == 2: + total_var = var_r + var_a + var_q + if self.mode == 3: + total_var = var_r # Exacerbate the sample difference. min_var = torch.min(total_var) max_var = torch.max(total_var) scale_var = max_var - min_var total_var = (total_var - min_var) / scale_var - total_var[total_var < phi] = phi + total_var[total_var < self.max_threshold] = self.max_scale total_stds = 1 / total_var - return total_stds.detach() def set_statistics(self, stats: dict) -> None: diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index e0f5e9e2..e1354bd6 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -173,6 +173,7 @@ class DynaSAT_BatchReweightConfig(AlgorithmConfig): num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 + class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_MaxBatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 @@ -190,6 +191,12 @@ class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 + max_scale: Optional[float] = 0.1 + max_threshold: Optional[float] = 0.7 + mode: Optional[int] = 0 + sample_times: Optional[int] = 10 + + class DynaSAC_ExaBatchReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_ExaBatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index ccc349cf..4eae6e40 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -112,6 +112,10 @@ def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: Algori alpha_lr=config.alpha_lr, horizon=config.horizon, num_samples=config.num_samples, + max_scale=config.max_scale, + max_threshold=config.max_threshold, + 
mode=config.mode, + sample_times=config.sample_times, ) return agent From 52d70270ad7f2207d16a43b6e6bb07a8945dce00 Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 12 Jun 2024 12:11:47 +1200 Subject: [PATCH 30/91] Adjust algorithm. --- ...tchReweight.py => DynaSAC_BinaryBatchReweight.py} | 12 +++++------- .../algorithm/mbrl/__init__.py | 2 +- cares_reinforcement_learning/util/configurations.py | 7 +++---- cares_reinforcement_learning/util/network_factory.py | 9 ++++----- 4 files changed, 13 insertions(+), 17 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_MaxBatchReweight.py => DynaSAC_BinaryBatchReweight.py} (98%) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py similarity index 98% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py index 657382df..e2bf16af 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py @@ -19,7 +19,7 @@ ) -class DynaSAC_MaxBatchReweight: +class DynaSAC_BinaryBatchReweight: """ Max as ? """ @@ -36,8 +36,7 @@ def __init__( alpha_lr: float, num_samples: int, horizon: int, - max_scale: float, - max_threshold: float, + threshold_scale: float, mode: int, sample_times: int, device: torch.device, @@ -77,8 +76,7 @@ def __init__( # World model self.world_model = world_network # Parameter - self.max_scale = max_scale - self.max_threshold = max_threshold + self.threshold_scale = threshold_scale self.mode = mode self.sample_times = sample_times @@ -374,8 +372,8 @@ def sampling(self, pred_means, pred_vars): min_var = torch.min(total_var) max_var = torch.max(total_var) scale_var = max_var - min_var - total_var = (total_var - min_var) / scale_var - total_var[total_var < self.max_threshold] = self.max_scale + threshold = (self.threshold_scale * scale_var) + min_var + total_var[total_var <= threshold] = min_var total_stds = 1 / total_var return total_stds.detach() diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index b7e3e5c9..d7c59f62 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -4,4 +4,4 @@ from .DynaSAT import DynaSAT from .DynaSAT_BatchReweight import DynaSAT_BatchReweight from .DynaSAC_ExaBatchReweight import DynaSAC_ExaBatchReweight -from .DynaSAC_MaxBatchReweight import DynaSAC_MaxBatchReweight +from .DynaSAC_BinaryBatchReweight import DynaSAC_BinaryBatchReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index e1354bd6..38f02932 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -174,8 +174,8 @@ class DynaSAT_BatchReweightConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 -class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_MaxBatchReweight", Literal=True) +class DynaSAC_BinaryBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_BinaryBatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -191,8 +191,7 @@ class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): num_samples: Optional[int] = 
10 world_model_lr: Optional[float] = 0.001 - max_scale: Optional[float] = 0.1 - max_threshold: Optional[float] = 0.7 + threshold_scale: Optional[float] = 0.7 mode: Optional[int] = 0 sample_times: Optional[int] = 10 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 4eae6e40..2c5c869c 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -76,13 +76,13 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): ) return agent -def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_MaxBatchReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BinaryBatchReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -99,7 +99,7 @@ def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: Algori lr=config.world_model_lr, ) - agent = DynaSAC_MaxBatchReweight( + agent = DynaSAC_BinaryBatchReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -112,8 +112,7 @@ def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: Algori alpha_lr=config.alpha_lr, horizon=config.horizon, num_samples=config.num_samples, - max_scale=config.max_scale, - max_threshold=config.max_threshold, + threshold_scale=config.threshold_scale, mode=config.mode, sample_times=config.sample_times, ) From 78464b9b1308fc95b5dc8260384c78e996cdc75a Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 12 Jun 2024 12:35:58 +1200 Subject: [PATCH 31/91] Adjust algorithm. --- .../algorithm/mbrl/DynaSAC_BinaryBatchReweight.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py index e2bf16af..f34ea44f 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py @@ -371,8 +371,9 @@ def sampling(self, pred_means, pred_vars): # Exacerbate the sample difference. 
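            # With this change the cut-off is measured from the batch mean
            # rather than from the minimum: any sample whose variance is at
            # most mean + threshold_scale * (max - mean) is reset to min_var
            # (and so gets the largest possible weight after inversion), while
            # only the clear outliers above that threshold keep a large
            # variance and are down-weighted.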
min_var = torch.min(total_var) max_var = torch.max(total_var) - scale_var = max_var - min_var - threshold = (self.threshold_scale * scale_var) + min_var + # scale_var = max_var - min_var + mean_var = torch.mean(total_var) + threshold = (self.threshold_scale * (max_var - mean_var)) + mean_var total_var[total_var <= threshold] = min_var total_stds = 1 / total_var return total_stds.detach() From 3e75218ff4c190d5726d6c208f7b4773afa96370 Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 17 Jun 2024 08:28:56 +1200 Subject: [PATCH 32/91] clean up --- .../algorithm/mbrl/DynaSAC_BatchReweight.py | 368 ---------------- ...eight.py => DynaSAC_ScaleBatchReweight.py} | 88 ++-- .../algorithm/mbrl/DynaSAC_Var.py | 363 ---------------- .../algorithm/mbrl/DynaSAT.py | 251 ----------- .../algorithm/mbrl/DynaSAT_BatchReweight.py | 394 ------------------ .../algorithm/mbrl/__init__.py | 6 +- .../util/configurations.py | 85 +--- .../util/network_factory.py | 176 +------- 8 files changed, 80 insertions(+), 1651 deletions(-) delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_ExaBatchReweight.py => DynaSAC_ScaleBatchReweight.py} (85%) delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py deleted file mode 100644 index b2cbcd08..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BatchReweight.py +++ /dev/null @@ -1,368 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer - -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( - EnsembleWorldAndOneReward, -) - - -class DynaSAC_BatchReweight: - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - # Original loss function - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. 
- - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy Dyna will work. - :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. 
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, pred_means, pred_vars, phi=0.0001): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - sample_times = 10 - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [sample_times]) - rs = [] - acts = [] - qs = [] - # Varying the next_state's distribution. - for i in range(sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. - # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. 
- # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - - rs = torch.stack(rs) - acts = torch.stack(acts) - qs = torch.stack(qs) - - var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - # Computing covariance. - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) - - total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra - total_var[total_var < phi] = phi - total_stds = 1 / total_var - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py similarity index 85% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 02bc5a70..cf668539 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ExaBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -19,7 +19,10 @@ ) -class DynaSAC_ExaBatchReweight: +class DynaSAC_ScaleBatchReweight: + """ + Max as ? 
+ """ def __init__( self, actor_network: torch.nn.Module, @@ -33,6 +36,10 @@ def __init__( alpha_lr: float, num_samples: int, horizon: int, + threshold_scale: float, + variance_scale: float, + mode: int, + sample_times: int, device: torch.device, ): self.type = "mbrl" @@ -69,6 +76,11 @@ def __init__( # World model self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.variance_scale = variance_scale + self.mode = mode + self.sample_times = sample_times @property def _alpha(self) -> float: @@ -255,7 +267,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars, phi=0.0001): + def sampling(self, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -263,24 +275,23 @@ def sampling(self, pred_means, pred_vars, phi=0.0001): :param pred_vars: :return: """ - sample_times = 10 with torch.no_grad(): # 5 models. Each predict 10 next_states. sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [sample_times]) + [self.sample_times]) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [sample_times]) + [self.sample_times]) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [sample_times]) + [self.sample_times]) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [sample_times]) + [self.sample_times]) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [sample_times]) + [self.sample_times]) rs = [] acts = [] qs = [] # Varying the next_state's distribution. - for i in range(sample_times): + for i in range(self.sample_times): # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1[i]) pred_rwd2 = self.world_model.pred_rewards(sample2[i]) @@ -329,33 +340,52 @@ def sampling(self, pred_means, pred_vars, phi=0.0001): qs = torch.stack(qs) var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) # Computing covariance. - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + # Ablation + if self.mode == 0: + total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + if self.mode == 1: + total_var = var_r + var_a + var_q + 2 * cov_aq + if self.mode == 2: + total_var = var_r + var_a + var_q + if self.mode == 3: + total_var = var_r - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) + # Exacerbate the sample difference. 
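            # The block below turns the ensemble variance into weights:
            # variances are first normalised by their batch mean, everything at
            # or below threshold_scale * (max - min) + min is clamped up to
            # that threshold, and the weight is the reciprocal. Low-uncertainty
            # samples therefore share one maximal weight (1 / threshold), while
            # higher-uncertainty samples receive the reciprocal of their
            # (mean-normalised) variance.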
+ old_mean_var = torch.mean(total_var) + # normalize vars to sum = 1 + total_var /= old_mean_var + min_var = torch.min(total_var) + max_var = torch.max(total_var) + # mean_var = torch.mean(total_var) - total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + threshold = self.threshold_scale * (max_var - min_var) + min_var + total_var[total_var <= threshold] = threshold - # Exacerbate the sample difference. - scale = 2.0 - mean_var = torch.mean(total_var) - diff = total_var - mean_var - scale_diff = scale * diff - total_var += scale_diff - total_var[total_var < phi] = phi + # threshold = (self.threshold_scale * (max_var - mean_var)) + mean_var + # threshold = torch.min(threshold, min_var) + # total_var[total_var <= threshold] = max_var * self.variance_scale total_stds = 1 / total_var - return total_stds.detach() def set_statistics(self, stats: dict) -> None: diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py deleted file mode 100644 index d53859e7..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Var.py +++ /dev/null @@ -1,363 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer - -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( - EnsembleWorldAndOneReward, -) - - -class DynaSAC_Var: - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - # Original loss function - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. 
- - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy Dyna will work. - :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. 
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, pred_means, pred_vars, phi=0.0001): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - sample_times = 10 - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [sample_times]) - rs = [] - acts = [] - qs = [] - # Varying the next_state's distribution. - for i in range(sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. - # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. 
- # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - - rs = torch.stack(rs) - acts = torch.stack(acts) - qs = torch.stack(qs) - - var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - # Computing covariance. - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - total_var = var_r + var_a + var_q + 2 * cov_aq - total_var[total_var < phi] = phi - # total_stds = 1 / total_var - return total_var.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py deleted file mode 100644 index f67d30f7..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAT.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
- -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch - -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer - -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( - EnsembleWorldAndOneReward, -) - - -class DynaSAT: - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two, target_q_three = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(torch.minimum(target_q_one, target_q_two), target_q_three) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two, q_values_three = self.critic_net(states, actions) - critic_loss_one = ((q_values_one - q_target).pow(2)).mean() - critic_loss_two = ((q_values_two - q_target).pow(2)).mean() - critic_loss_three = ((q_values_three - q_target).pow(2)).mean() - - critic_loss_total = critic_loss_one + critic_loss_two + critic_loss_three - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the 
Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi, qf3_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(torch.minimum(qf1_pi, qf2_pi), qf3_pi) - - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, _, _ = self.world_model.pred_next_states( - pred_state, pred_acts - ) - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - # Pay attention to here! It is dones in the Cares RL Code! 
- pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones - ) - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py deleted file mode 100644 index 548bc8b9..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAT_BatchReweight.py +++ /dev/null @@ -1,394 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer - -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( - EnsembleWorldAndOneReward, -) - - -class DynaSAT_BatchReweight: - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two, target_q_three = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(torch.minimum(target_q_one, target_q_two), target_q_three) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two, q_values_three = self.critic_net(states, actions) - - # Original loss function - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - l2_loss_three = (q_values_three - q_target).pow(2) - - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - disc_l2_loss_three = l2_loss_three * weights - - # A ratio to scale the loss back to original loss scale. 
- ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - ratio_3 = torch.mean(l2_loss_three) / torch.mean(disc_l2_loss_three) - ratio_3 = ratio_3.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - critic_loss_three = disc_l2_loss_three.mean() * ratio_3 - - critic_loss_total = critic_loss_one + critic_loss_two + critic_loss_three - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi, qf3_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(torch.minimum(qf1_pi, qf2_pi), qf3_pi) - - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. 
But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - - pred_reward = self.world_model.pred_rewards(pred_next_state) - # uncert = torch.ones(pred_reward.shape).to(self.device) - pred_uncerts.append(uncert) - - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, pred_means, pred_vars, phi=0.0001): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - sample_times = 10 - with torch.no_grad(): - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [sample_times]) - - rs = [] - acts = [] - qs = [] - q_vars = [] - q_means = [] - # Varying the next_state's distribution. - for i in range(sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. - # [2560, 17] - # Same sample, different model same next_state. - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. 
- # 5 models * 10 samples [var of state] - qa1, qa2, qa3 = self.target_critic_net(sample1[i], pred_act1) - qa_mean = (qa1 + qa2 + qa3) / 3.0 - qa_var = ((qa1 - qa_mean).pow(2) + (qa2 - qa_mean).pow(2) + (qa3 - qa_mean).pow(2)) / 3.0 - q_vars.append(qa_var) - q_means.append(qa_mean) - # qa_mins = torch.minimum(torch.minimum(qa1, qa2), qa3) - # qs.append(qa_mins) - - qb1, qb2, qb3 = self.target_critic_net(sample2[i], pred_act2) - qb_mean = (qb1 + qb2 + qb3) / 3.0 - qb_var = ((qb1 - qb_mean).pow(2) + (qb2 - qb_mean).pow(2) + (qb3 - qb_mean).pow(2)) / 3.0 - q_vars.append(qb_var) - q_means.append(qb_mean) - # qb_mins = torch.minimum(torch.minimum(qb1, qb2), qb3) - # qs.append(qb_mins) - - qc1, qc2, qc3 = self.target_critic_net(sample3[i], pred_act3) - qc_mean = (qc1 + qc2 + qc3) / 3.0 - qc_var = ((qc1 - qc_mean).pow(2) + (qc2 - qc_mean).pow(2) + (qc3 - qc_mean).pow(2)) / 3.0 - q_vars.append(qc_var) - q_means.append(qc_mean) - # qc_mins = torch.minimum(torch.minimum(qc1, qc2), qc3) - # qs.append(qc_mins) - - qd1, qd2, qd3 = self.target_critic_net(sample4[i], pred_act4) - qd_mean = (qd1 + qd2 + qd3) / 3.0 - qd_var = ((qd1 - qd_mean).pow(2) + (qd2 - qd_mean).pow(2) + (qd3 - qd_mean).pow(2)) / 3.0 - q_vars.append(qd_var) - q_means.append(qd_mean) - # qd_mins = torch.minimum(torch.minimum(qd1, qd2), qd3) - # qs.append(qd_mins) - - qe1, qe2, qe3 = self.target_critic_net(sample5[i], pred_act5) - qe_mean = (qe1 + qe2 + qe3) / 3.0 - qe_var = ((qe1 - qe_mean).pow(2) + (qe2 - qe_mean).pow(2) + (qe3 - qe_mean).pow(2)) / 3.0 - q_vars.append(qe_var) - q_means.append(qe_mean) - # qe_mins = torch.minimum(torch.minimum(qe1, qe2), qe3) - # qs.append(qe_mins) - - rs = torch.stack(rs) - acts = torch.stack(acts) - # qs = torch.stack(qs) - - var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - - q_vars = torch.stack(q_vars) - q_means = torch.stack(q_means) - var_of_mean = torch.var(q_means, dim=0) - mean_of_vars = torch.mean(q_vars, dim=0) - var_q = var_of_mean + mean_of_vars - - # Computing covariance. 
- # mean_a = torch.mean(acts, dim=0, keepdim=True) - # mean_q = torch.mean(qs, dim=0, keepdim=True) - # diff_a = acts - mean_a - # diff_q = qs - mean_q - # cov_aq = torch.mean(diff_a * diff_q, dim=0) - - total_var = var_r + var_a + var_q # + 2 * cov_aq - # Clip for sigmoid - total_var[total_var < phi] = phi - total_stds = 1 / total_var - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index d7c59f62..4e98bc13 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,7 +1,3 @@ from .DynaSAC import DynaSAC -from .DynaSAC_BatchReweight import DynaSAC_BatchReweight -from .DynaSAC_Var import DynaSAC_Var -from .DynaSAT import DynaSAT -from .DynaSAT_BatchReweight import DynaSAT_BatchReweight -from .DynaSAC_ExaBatchReweight import DynaSAC_ExaBatchReweight +from .DynaSAC_ScaleBatchReweight import DynaSAC_ScaleBatchReweight from .DynaSAC_BinaryBatchReweight import DynaSAC_BinaryBatchReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 38f02932..8e9abe79 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -138,26 +138,8 @@ class SACConfig(AlgorithmConfig): reward_scale: Optional[float] = 1.0 -class DynaSAC_VarConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_Var", Literal=True) - actor_lr: Optional[float] = 3e-4 - critic_lr: Optional[float] = 3e-4 - - alpha_lr: Optional[float] = 3e-4 - use_bounded_active: Optional[bool] = False - num_models: Optional[int] = 5 - - gamma: Optional[float] = 0.99 - tau: Optional[float] = 0.005 - reward_scale: Optional[float] = 1.0 - - horizon: Optional[int] = 1 - num_samples: Optional[int] = 10 - world_model_lr: Optional[float] = 0.001 - - -class DynaSAT_BatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAT_BatchReweight", Literal=True) +class DynaSACConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -192,12 +174,12 @@ class DynaSAC_BinaryBatchReweightConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 threshold_scale: Optional[float] = 0.7 - mode: Optional[int] = 0 + mode: Optional[int] = 1 sample_times: Optional[int] = 10 -class DynaSAC_ExaBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_ExaBatchReweight", Literal=True) +class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_ScaleBatchReweight", Literal=True) actor_lr: 
Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -213,59 +195,10 @@ class DynaSAC_ExaBatchReweightConfig(AlgorithmConfig): num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 - -class DynaSAC_BatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_BatchReweight", Literal=True) - actor_lr: Optional[float] = 3e-4 - critic_lr: Optional[float] = 3e-4 - - alpha_lr: Optional[float] = 3e-4 - use_bounded_active: Optional[bool] = False - num_models: Optional[int] = 5 - - gamma: Optional[float] = 0.99 - tau: Optional[float] = 0.005 - reward_scale: Optional[float] = 1.0 - - horizon: Optional[int] = 1 - num_samples: Optional[int] = 10 - world_model_lr: Optional[float] = 0.001 - - -class DynaSATConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAT", Literal=True) - actor_lr: Optional[float] = 3e-4 - critic_lr: Optional[float] = 3e-4 - - alpha_lr: Optional[float] = 3e-4 - use_bounded_active: Optional[bool] = False - num_models: Optional[int] = 5 - - gamma: Optional[float] = 0.99 - tau: Optional[float] = 0.005 - reward_scale: Optional[float] = 1.0 - - horizon: Optional[int] = 1 - num_samples: Optional[int] = 10 - world_model_lr: Optional[float] = 0.001 - - -class DynaSACConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC", Literal=True) - actor_lr: Optional[float] = 3e-4 - critic_lr: Optional[float] = 3e-4 - - alpha_lr: Optional[float] = 3e-4 - use_bounded_active: Optional[bool] = False - num_models: Optional[int] = 5 - - gamma: Optional[float] = 0.99 - tau: Optional[float] = 0.005 - reward_scale: Optional[float] = 1.0 - - horizon: Optional[int] = 1 - num_samples: Optional[int] = 10 - world_model_lr: Optional[float] = 0.001 + threshold_scale: Optional[float] = 0.7 + variance_scale: Optional[float] = 0.1 + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 class NaSATD3Config(AlgorithmConfig): diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 2c5c869c..c8d34aed 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -76,13 +76,14 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): ) return agent -def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: AlgorithmConfig): + +def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BinaryBatchReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_ScaleBatchReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -99,7 +100,7 @@ def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: Alg lr=config.world_model_lr, ) - agent = DynaSAC_BinaryBatchReweight( + agent = DynaSAC_ScaleBatchReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -113,178 +114,20 @@ def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: Alg horizon=config.horizon, num_samples=config.num_samples, threshold_scale=config.threshold_scale, + variance_scale=config.variance_scale, mode=config.mode, sample_times=config.sample_times, ) return agent -def create_DynaSAC_ExaBatchReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_ExaBatchReweight - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAC_ExaBatchReweight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - ) - return agent - -def create_DynaSAT_BatchReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAT_BatchReweight - from cares_reinforcement_learning.networks.SAC import Actor, TriCritic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward - - actor = Actor(observation_size, action_num) - critic = TriCritic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAT_BatchReweight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - ) - return agent - - -def create_DynaSAT(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. 
- - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAT - from cares_reinforcement_learning.networks.SAC import Actor, TriCritic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward - - actor = Actor(observation_size, action_num) - critic = TriCritic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - lr=config.world_model_lr, - device=device, - ) - - agent = DynaSAT( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - device=device, - ) - return agent - - -def create_DynaSAC_BatchReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BatchReweight - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAC_BatchReweight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - ) - return agent - - -def create_DynaSAC_Var(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Var + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BinaryBatchReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -301,7 +144,7 @@ def create_DynaSAC_Var(observation_size, action_num, config: AlgorithmConfig): lr=config.world_model_lr, ) - agent = DynaSAC_Var( + agent = DynaSAC_BinaryBatchReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -314,6 +157,9 @@ def create_DynaSAC_Var(observation_size, action_num, config: AlgorithmConfig): alpha_lr=config.alpha_lr, horizon=config.horizon, num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + mode=config.mode, + sample_times=config.sample_times, ) return agent From 99dd56e0970a0a0d08a7452ecec02bf6176d7c3f Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 17 Jun 2024 08:38:02 +1200 Subject: [PATCH 33/91] clean up --- .../mbrl/DynaSAC_MaxBatchReweight.py | 399 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 1 + .../util/configurations.py | 23 + .../util/network_factory.py | 44 ++ 4 files changed, 467 insertions(+) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py new file mode 100644 index 00000000..9645d89f --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py @@ -0,0 +1,399 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_MaxBatchReweight: + """ + Max as ? + """ + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + variance_scale: float, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.variance_scale = variance_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
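+        # The detached ratio only rescales the weighted loss back to the magnitude of the
+        # plain MSE; the gradient direction is still set by the relative weights, so the
+        # effective critic learning rate stays comparable to the unweighted update.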
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. 
+ rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(self.sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. 
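+                # For each sampled next state, evaluate the twin target critics on the policy's
+                # sampled action and keep the elementwise minimum; the spread of these minima
+                # across the five world-model heads and the sample_times draws is the uncertainty
+                # attributed to the value part of the target.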
+ # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + # Ablation + if self.mode == 0: + total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + if self.mode == 1: + total_var = var_r + var_a + var_q + 2 * cov_aq + if self.mode == 2: + total_var = var_r + var_a + var_q + if self.mode == 3: + total_var = var_r + + # Exacerbate the sample difference. + min_var = torch.min(total_var) + max_var = torch.max(total_var) + total_var /= (max_var - min_var) + threshold = self.threshold_scale + total_var[total_var <= threshold] = self.variance_scale + + total_stds = 1 / total_var + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 4e98bc13..661f9882 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,3 +1,4 @@ from .DynaSAC import DynaSAC from .DynaSAC_ScaleBatchReweight import DynaSAC_ScaleBatchReweight from .DynaSAC_BinaryBatchReweight import DynaSAC_BinaryBatchReweight +from .DynaSAC_MaxBatchReweight import DynaSAC_MaxBatchReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 8e9abe79..21415cfd 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -178,6 +178,29 @@ class DynaSAC_BinaryBatchReweightConfig(AlgorithmConfig): sample_times: Optional[int] = 10 +class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): + 
algorithm: str = Field("DynaSAC_MaxBatchReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + threshold_scale: Optional[float] = 0.7 + variance_scale: Optional[float] = 0.1 + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 + + class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_ScaleBatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index c8d34aed..e11f9917 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -121,6 +121,50 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo return agent +def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_MaxBatchReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_MaxBatchReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + variance_scale=config.variance_scale, + mode=config.mode, + sample_times=config.sample_times, + ) + return agent + + def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. 
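The reweighting added above reduces to two steps: estimate the variance of each imagined transition's target, then weight the squared TD error by the inverse of that variance while rescaling so the overall loss magnitude is unchanged. Below is a minimal, self-contained sketch of the scale/threshold variant of that rule, assuming the per-transition variance estimates are already available; function names, defaults, and tensor shapes here are illustrative rather than taken from the repository.

import torch


def inverse_variance_weights(total_var: torch.Tensor, threshold_scale: float = 0.7) -> torch.Tensor:
    """Turn per-transition variance estimates into bounded inverse-variance weights."""
    # Normalise by the batch mean so only relative uncertainty matters.
    total_var = total_var / total_var.mean()
    min_var, max_var = total_var.min(), total_var.max()
    # Clip from below: variances smaller than the threshold are raised to it,
    # which caps the largest weight any single transition can receive.
    threshold = threshold_scale * (max_var - min_var) + min_var
    total_var = torch.maximum(total_var, threshold)
    return (1.0 / total_var).detach()


def reweighted_mse(pred: torch.Tensor, target: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
    """Weighted squared error, rescaled back to the magnitude of the plain MSE."""
    l2 = (pred - target).pow(2)
    weighted = l2 * weights.detach()
    ratio = (l2.mean() / weighted.mean()).detach()
    return weighted.mean() * ratio


# Toy usage: four imagined transitions, the last one far more uncertain than the rest.
variances = torch.tensor([[0.2], [0.5], [1.0], [4.0]])
w = inverse_variance_weights(variances)
loss = reweighted_mse(torch.randn(4, 1), torch.randn(4, 1), w)

Because both the weights and the rescaling ratio are detached, the uncertainty estimate shapes the gradient but is never itself trained against.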
From ec841e4e4e92a24bf1783184b2700f3b915bc8fe Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 17 Jun 2024 09:52:43 +1200 Subject: [PATCH 34/91] gamma square --- .../algorithm/mbrl/DynaSAC_BinaryBatchReweight.py | 8 +++++--- .../algorithm/mbrl/DynaSAC_MaxBatchReweight.py | 9 ++++++--- .../algorithm/mbrl/DynaSAC_ScaleBatchReweight.py | 8 +++++--- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py index f34ea44f..8bf8508b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py @@ -358,13 +358,15 @@ def sampling(self, pred_means, pred_vars): cov_ra = torch.mean(diff_r * diff_a, dim=0) + gamma_sq = self.gamma * self.gamma # Ablation if self.mode == 0: - total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra if self.mode == 1: - total_var = var_r + var_a + var_q + 2 * cov_aq + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq if self.mode == 2: - total_var = var_r + var_a + var_q + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q if self.mode == 3: total_var = var_r diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py index 9645d89f..42205a36 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py @@ -360,16 +360,19 @@ def sampling(self, pred_means, pred_vars): cov_ra = torch.mean(diff_r * diff_a, dim=0) + gamma_sq = self.gamma * self.gamma # Ablation if self.mode == 0: - total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra if self.mode == 1: - total_var = var_r + var_a + var_q + 2 * cov_aq + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq if self.mode == 2: - total_var = var_r + var_a + var_q + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q if self.mode == 3: total_var = var_r + # Exacerbate the sample difference. 
min_var = torch.min(total_var) max_var = torch.max(total_var) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index cf668539..d9ebd9bc 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -360,13 +360,15 @@ def sampling(self, pred_means, pred_vars): cov_ra = torch.mean(diff_r * diff_a, dim=0) + gamma_sq = self.gamma * self.gamma # Ablation if self.mode == 0: - total_var = var_r + var_a + var_q + 2 * cov_aq + 2 * cov_rq + 2 * cov_ra + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra if self.mode == 1: - total_var = var_r + var_a + var_q + 2 * cov_aq + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq if self.mode == 2: - total_var = var_r + var_a + var_q + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q if self.mode == 3: total_var = var_r From 4cdf0e7c15e2907e460f3fdb84913ab113573506 Mon Sep 17 00:00:00 2001 From: tony Date: Sat, 22 Jun 2024 22:52:23 +1200 Subject: [PATCH 35/91] reweight_actor --- .../mbrl/DynaSAC_MaxBatchReweight.py | 2 - .../mbrl/DynaSAC_ScaleBatchReweight.py | 62 +++++++++++-------- .../algorithm/mbrl/STEVESAC.py | 0 .../util/configurations.py | 4 +- .../util/network_factory.py | 3 +- 5 files changed, 41 insertions(+), 30 deletions(-) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py index 42205a36..7ec92fac 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py @@ -372,14 +372,12 @@ def sampling(self, pred_means, pred_vars): if self.mode == 3: total_var = var_r - # Exacerbate the sample difference. 
min_var = torch.min(total_var) max_var = torch.max(total_var) total_var /= (max_var - min_var) threshold = self.threshold_scale total_var[total_var <= threshold] = self.variance_scale - total_stds = 1 / total_var return total_stds.detach() diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index d9ebd9bc..741d3c0d 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -13,6 +13,8 @@ import numpy as np import torch from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F + from cares_reinforcement_learning.networks.world_models.ensemble_world import ( EnsembleWorldAndOneReward, @@ -37,14 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, - variance_scale: float, + reweigt_critic: bool, + reweigt_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - + self.reweight_critic = reweigt_critic + self.reweight_actor = reweigt_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations @@ -78,7 +82,6 @@ def __init__( self.world_model = world_network # Parameter self.threshold_scale = threshold_scale - self.variance_scale = variance_scale self.mode = mode self.sample_times = sample_times @@ -125,25 +128,29 @@ def _train_policy( q_values_one, q_values_two = self.critic_net(states, actions) - # Original loss function - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
- ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - critic_loss_total = critic_loss_one + critic_loss_two + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two # Update the Critic self.critic_net_optimiser.zero_grad() @@ -154,7 +161,16 @@ def _train_policy( pi, first_log_p, _ = self.actor_net(states) qf1_pi, qf2_pi = self.critic_net(states, pi) min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() # Update the Actor self.actor_net_optimiser.zero_grad() @@ -378,15 +394,9 @@ def sampling(self, pred_means, pred_vars): total_var /= old_mean_var min_var = torch.min(total_var) max_var = torch.max(total_var) - # mean_var = torch.mean(total_var) - + # As (max-min) decrease, threshold should go down. 
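+            # Raising every variance below this threshold up to the threshold caps the largest
+            # inverse-variance weight at 1/threshold, so a few very confident imagined
+            # transitions cannot dominate the weighted loss.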
threshold = self.threshold_scale * (max_var - min_var) + min_var total_var[total_var <= threshold] = threshold - - # threshold = (self.threshold_scale * (max_var - mean_var)) + mean_var - # threshold = torch.min(threshold, min_var) - # total_var[total_var <= threshold] = max_var * self.variance_scale - total_stds = 1 / total_var return total_stds.detach() diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py new file mode 100644 index 00000000..e69de29b diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 21415cfd..b21eb580 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -219,7 +219,9 @@ class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 threshold_scale: Optional[float] = 0.7 - variance_scale: Optional[float] = 0.1 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + mode: Optional[int] = 1 sample_times: Optional[int] = 10 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index e11f9917..c9b772a1 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -114,7 +114,8 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo horizon=config.horizon, num_samples=config.num_samples, threshold_scale=config.threshold_scale, - variance_scale=config.variance_scale, + reweight_actor=config.reweight_actor, + reweight_critic=config.reweight_critic, mode=config.mode, sample_times=config.sample_times, ) From 1e722e384f5062b5a83e9a7ee36bf19f64aabc4c Mon Sep 17 00:00:00 2001 From: tony Date: Sat, 22 Jun 2024 22:55:48 +1200 Subject: [PATCH 36/91] reweight_actor --- cares_reinforcement_learning/util/network_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index c9b772a1..be87b72c 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -114,8 +114,8 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo horizon=config.horizon, num_samples=config.num_samples, threshold_scale=config.threshold_scale, - reweight_actor=config.reweight_actor, reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, mode=config.mode, sample_times=config.sample_times, ) From 9a59f3ff9b48a53a78c53b1e5d6465d9f8f52b49 Mon Sep 17 00:00:00 2001 From: tony Date: Sat, 22 Jun 2024 23:46:31 +1200 Subject: [PATCH 37/91] Add baselines. 
--- .../algorithm/mbrl/DynaSAC_BIV.py | 453 ++++++++++++++++++ .../algorithm/mbrl/DynaSAC_SUNRISE.py | 411 ++++++++++++++++ .../mbrl/DynaSAC_ScaleBatchReweight.py | 1 + .../algorithm/mbrl/DynaSAC_UWAC.py | 411 ++++++++++++++++ .../algorithm/mbrl/__init__.py | 3 + .../util/configurations.py | 73 +++ .../util/network_factory.py | 132 +++++ 7 files changed, 1484 insertions(+) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py new file mode 100644 index 00000000..17a0d109 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py @@ -0,0 +1,453 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os +from scipy.optimize import minimize +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_BIVReweight: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweigt_critic: bool, + reweigt_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweigt_critic + self.reweight_actor = reweigt_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
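# Illustrative aside: a minimal sketch (not part of the patch) of the automatic
# entropy-temperature tuning set up just below. Alpha is optimised in log-space so it
# stays positive, and its loss pushes the policy entropy towards -action_num. The
# log-probabilities here are made-up placeholders.
import torch

action_num = 6
log_alpha = torch.tensor(0.0, requires_grad=True)      # exp(0) = 1.0, the initial alpha
target_entropy = -action_num
log_alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)

first_log_p = torch.tensor([[-2.0], [-7.5]])           # per-sample log pi(a|s) from the actor
alpha_loss = -(log_alpha * (first_log_p + target_entropy).detach()).mean()
log_alpha_optimizer.zero_grad()
alpha_loss.backward()
log_alpha_optimizer.step()
print(log_alpha.exp())                                  # the alpha used when forming targets and losses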
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
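# Illustrative aside: a standalone sketch (not part of the patch) of the ratio trick used
# just below. After multiplying the per-sample L2 errors by the confidence weights, the
# mean loss is rescaled back to the magnitude of the unweighted loss, so the weighting
# redistributes gradient across samples without changing the overall loss scale.
# Numbers are made up.
import torch

l2_loss = torch.tensor([1.0, 4.0, 9.0])
weights = torch.tensor([1.0, 0.5, 0.1]).detach()
disc_l2_loss = l2_loss * weights
ratio = (torch.mean(l2_loss) / torch.mean(disc_l2_loss)).detach()
critic_loss = disc_l2_loss.mean() * ratio
assert torch.isclose(critic_loss, l2_loss.mean())       # same magnitude as the unweighted mean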
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. 
+ :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(self.sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. 
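# Illustrative aside: a minimal sketch (not part of the patch) of the per-model sampling
# above. Each ensemble member's predicted mean and spread define a Normal over the next
# state (the spread tensor is passed as the Normal's scale, as in the code above), and
# sample_times draws per member are pushed through the reward model, actor and critic to
# measure how much the TD target varies. Shapes here are made up.
import torch

sample_times, batch, obs_dim = 10, 4, 17
pred_mean = torch.zeros(batch, obs_dim)                       # one ensemble member's mean
pred_spread = torch.ones(batch, obs_dim)                      # that member's spread
samples = torch.distributions.Normal(pred_mean, pred_spread).sample([sample_times])
print(samples.shape)                                          # torch.Size([10, 4, 17]): one draw per sample_time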
+ # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + if self.mode == 1: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + if self.mode == 2: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + if self.mode == 3: + total_var = var_r + + xi = self.get_optimal_xi(total_var.detach().numpy()) + xi = torch.FloatTensor(xi).to(self.device) + total_var += xi + + # Weight = inverse of sum of weights * inverse of varaince. + total_stds = 1.0 / total_var + ratio = 1.0 / torch.sum(total_stds) + total_stds = ratio * total_stds + + return total_stds.detach() + + def get_iv_weights(self, variances): + ''' + Returns Inverse Variance weights + Params + ====== + variances (numpy array): variance of the targets + ''' + weights = 1 / variances + weights = weights / np.sum(weights) + return weights + + def compute_eff_bs(self, weights): + # Compute original effective mini-batch size + eff_bs = 1 / np.sum(np.square(weights)) + # print(eff_bs) + return eff_bs + + def get_optimal_xi(self, variances, minimal_size): + minimal_size = self.threshold_scale + minimal_size = min(variances.shape[0] - 1, minimal_size) + if self.compute_eff_bs(self.get_iv_weights(variances)) >= minimal_size: + return 0 + fn = lambda x: np.abs(self.compute_eff_bs(self.get_iv_weights(variances + np.abs(x))) - minimal_size) + epsilon = minimize(fn, 0, method='Nelder-Mead', options={'fatol': 1.0, 'maxiter': 100}) + xi = np.abs(epsilon.x[0]) + xi = 0 if xi is None else xi + return xi + + def compute_ebs(self, weights): + weights_sum = torch.sum(weights) + weights_square = weights.pow(2) + # ebs = square of sum / sum of square. 
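# Illustrative aside: a self-contained sketch (not part of the patch) of the BIV-style
# weighting above, mirroring get_iv_weights / compute_eff_bs / get_optimal_xi. Inverse-
# variance weights are normalised to sum to one, the effective batch size is
# (sum w)^2 / sum(w^2), and a common offset xi added to every variance is searched with
# Nelder-Mead until the effective batch size reaches a target. Variances and the target
# here are made up.
import numpy as np
from scipy.optimize import minimize

variances = np.array([0.1, 0.2, 5.0, 10.0])
target_eff_bs = 3.0                                    # plays the role of minimal_size / threshold_scale

def eff_batch_size(var, xi=0.0):
    weights = 1.0 / (var + xi)
    weights = weights / np.sum(weights)
    return 1.0 / np.sum(np.square(weights))

result = minimize(lambda x: np.abs(eff_batch_size(variances, np.abs(x[0])) - target_eff_bs),
                  x0=0.0, method="Nelder-Mead", options={"fatol": 1.0, "maxiter": 100})
xi = np.abs(result.x[0])
print(eff_batch_size(variances), eff_batch_size(variances, xi))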
+ ebs = weights_sum.pow(2) / torch.sum(weights_square) + return ebs + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py new file mode 100644 index 00000000..7115ffa9 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py @@ -0,0 +1,411 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_SUNRISEReweight: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweigt_critic: bool, + reweigt_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweigt_critic + self.reweight_actor = reweigt_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. 
+ :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(self.sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. 
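# Illustrative aside: a minimal sketch (not part of the patch) of the SUNRISE-style weight
# that this file's sampling() returns further below: w = sigmoid(-sqrt(total_var) * scale)
# + 0.5 maps any uncertainty onto a weight in (0.5, 1.0], shrinking smoothly as the
# estimated standard deviation of the target grows. Values here are made up.
import torch

total_var = torch.tensor([0.01, 0.5, 4.0, 25.0])
temperature = 0.7                                    # plays the role of self.threshold_scale
weights = torch.sigmoid(-1.0 * torch.sqrt(total_var) * temperature) + 0.5
print(weights)                                       # near 1.0 for certain samples, towards 0.5 for noisy ones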
+ # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + if self.mode == 1: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + if self.mode == 2: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + if self.mode == 3: + total_var = var_r + + total_stds = torch.sigmoid(-1 * torch.sqrt(total_var) * self.threshold_scale) + 0.5 + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 741d3c0d..42542baf 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -398,6 +398,7 @@ def sampling(self, pred_means, pred_vars): threshold = self.threshold_scale * (max_var - min_var) + min_var total_var[total_var <= threshold] = threshold total_stds = 1 / total_var + return total_stds.detach() def set_statistics(self, stats: dict) -> None: diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py new file mode 100644 index 00000000..6380fa4e --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py @@ -0,0 +1,411 @@ +""" +Sutton, Richard S. 
"Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_UWACReweight: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweigt_critic: bool, + reweigt_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweigt_critic + self.reweight_actor = reweigt_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? 
+ with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. + + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = 
torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. 
+ for i in range(self.sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. 
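# Illustrative aside: a minimal sketch (not part of the patch) of the covariance estimate
# used just below. Samples are centred by their mean over the draw dimension and the
# covariance is the mean of the elementwise product, i.e.
# Cov(a, q) ~ E[(a - E[a]) * (q - E[q])]. Shapes here are made up.
import torch

draws, batch = 50, 8                                  # e.g. 5 models x 10 sample_times draws per target
acts = torch.randn(draws, batch)
qs = torch.randn(draws, batch)
diff_a = acts - torch.mean(acts, dim=0, keepdim=True)
diff_q = qs - torch.mean(qs, dim=0, keepdim=True)
cov_aq = torch.mean(diff_a * diff_q, dim=0)           # one covariance estimate per target in the batch
print(cov_aq.shape)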
+ if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + if self.mode == 1: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + if self.mode == 2: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + if self.mode == 3: + total_var = var_r + + total_stds = torch.minimum(self.threshold_scale/total_var, torch.ones(total_var.shape) * 1.5) + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 661f9882..d1ab0109 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -2,3 +2,6 @@ from .DynaSAC_ScaleBatchReweight import DynaSAC_ScaleBatchReweight from .DynaSAC_BinaryBatchReweight import DynaSAC_BinaryBatchReweight from .DynaSAC_MaxBatchReweight import DynaSAC_MaxBatchReweight +from .DynaSAC_SUNRISE import DynaSAC_SUNRISEReweight +from .DynaSAC_UWAC import DynaSAC_UWACReweight +from .DynaSAC_BIV import DynaSAC_BIVReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index b21eb580..d7db18b3 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -225,6 +225,79 @@ class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): mode: Optional[int] = 1 sample_times: Optional[int] = 10 +class DynaSAC_BIVReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_BIVReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + threshold_scale: Optional[float] = 0.7 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 + + +class 
DynaSAC_SUNRISEReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_SUNRISEReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + threshold_scale: Optional[float] = 0.7 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 + +class DynaSAC_UWACReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_UWACReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + threshold_scale: Optional[float] = 0.7 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 + class NaSATD3Config(AlgorithmConfig): algorithm: str = Field("NaSATD3", Literal=True) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index be87b72c..788326f7 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -121,6 +121,138 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo ) return agent +def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_BIVReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, + mode=config.mode, + sample_times=config.sample_times, + ) + return agent + +def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_SUNRISEReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, + mode=config.mode, + sample_times=config.sample_times, + ) + return agent + +def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_UWACReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, + mode=config.mode, + sample_times=config.sample_times, + ) + return agent + def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ From 344512932c53a1612eda771ecf44dc4734ba6bbc Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 23 Jun 2024 19:10:41 +1200 Subject: [PATCH 38/91] Add baselines. --- .../DynaSAC_NormalizedSigmoidBatchReweight.py | 406 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 1 + .../util/configurations.py | 25 ++ .../util/network_factory.py | 45 ++ 4 files changed, 477 insertions(+) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py new file mode 100644 index 00000000..48a9e493 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py @@ -0,0 +1,406 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_world import ( + EnsembleWorldAndOneReward, +) + + +class DynaSAC_NormalizedSigmoidBatchReweight: + """ + Max as ? + """ + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + # Original loss function + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + # Reweighted loss function. weight not participant in training. 
+ weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. + + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. 
+ :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( + pred_state, pred_acts + ) + uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times]) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(self.sample_times): + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1[i]) + pred_act2, log_pi2, _ = self.actor_net(sample2[i]) + pred_act3, log_pi3, _ = self.actor_net(sample3[i]) + pred_act4, log_pi4, _ = self.actor_net(sample4[i]) + pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. 
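+                # In total this yields 5 * sample_times reward, log-prob and
+                # min-Q estimates per transition; their spread is what the
+                # variance and covariance terms below are computed over.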
+ # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + if self.mode == 1: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + if self.mode == 2: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + if self.mode == 3: + total_var = var_r + + # Exacerbate the sample difference. + min_var = torch.min(total_var) + max_var = torch.max(total_var) + scale_var = max_var - min_var + # 0 - 1 + total_var -= min_var + total_var /= scale_var + # 0 - scale + total_var *= self.threshold_scale + total_stds = torch.sigmoid(-1.0 * total_var) + 0.5 + + # mean_var = torch.mean(total_var) + # threshold = (self.threshold_scale * (max_var - mean_var)) + mean_var + # total_var[total_var <= threshold] = min_var + # total_stds = 1 / total_var + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index d1ab0109..9614f39e 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -5,3 +5,4 @@ from .DynaSAC_SUNRISE import DynaSAC_SUNRISEReweight from .DynaSAC_UWAC import DynaSAC_UWACReweight from .DynaSAC_BIV import DynaSAC_BIVReweight +from .DynaSAC_NormalizedSigmoidBatchReweight import DynaSAC_NormalizedSigmoidBatchReweight diff --git 
a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index d7db18b3..b3d846c6 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -299,6 +299,31 @@ class DynaSAC_UWACReweightConfig(AlgorithmConfig): sample_times: Optional[int] = 10 +class DynaSAC_NormalizedSigmoidBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_NormalizedSigmoidBatchReweight", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + threshold_scale: Optional[float] = 4.0 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 + + class NaSATD3Config(AlgorithmConfig): algorithm: str = Field("NaSATD3", Literal=True) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 788326f7..1ad24fb0 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -254,6 +254,51 @@ def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmC return agent +def create_DynaSAC_NormalizedSigmoidBatchReweight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NormalizedSigmoidBatchReweight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_NormalizedSigmoidBatchReweight( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, + mode=config.mode, + sample_times=config.sample_times, + ) + return agent + + def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. From 7626722daedba814ea86dc8ca9ec3a33d8f57eac Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 23 Jun 2024 19:21:55 +1200 Subject: [PATCH 39/91] Add baselines. 
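
Adds reweight_critic / reweight_actor switches to the algorithm so the critic
and actor losses can be reweighted independently; with both switched off the
update falls back to the plain MSE losses. A minimal sketch of how the new
switches are expected to be driven from the config (class and field names as
defined in util/configurations.py; assumes the remaining AlgorithmConfig
fields keep their defaults):

    from cares_reinforcement_learning.util.configurations import (
        DynaSAC_NormalizedSigmoidBatchReweightConfig,
    )

    # Reweight only the critic's TD loss; leave the actor loss unweighted.
    config = DynaSAC_NormalizedSigmoidBatchReweightConfig(
        reweight_critic=True,
        reweight_actor=False,
    )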
--- .../DynaSAC_NormalizedSigmoidBatchReweight.py | 56 ++++++++++++------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py index 48a9e493..709ee29c 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py @@ -13,6 +13,8 @@ import numpy as np import torch from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F + from cares_reinforcement_learning.networks.world_models.ensemble_world import ( EnsembleWorldAndOneReward, @@ -37,13 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, + reweigt_critic: bool, + reweigt_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - + self.reweight_critic = reweigt_critic + self.reweight_actor = reweigt_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations @@ -123,25 +128,29 @@ def _train_policy( q_values_one, q_values_two = self.critic_net(states, actions) - # Original loss function - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
- ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - critic_loss_total = critic_loss_one + critic_loss_two + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two # Update the Critic self.critic_net_optimiser.zero_grad() @@ -152,7 +161,16 @@ def _train_policy( pi, first_log_p, _ = self.actor_net(states) qf1_pi, qf2_pi = self.critic_net(states, pi) min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() # Update the Actor self.actor_net_optimiser.zero_grad() @@ -381,10 +399,6 @@ def sampling(self, pred_means, pred_vars): total_var *= self.threshold_scale total_stds = torch.sigmoid(-1.0 * total_var) + 0.5 - # mean_var = torch.mean(total_var) - # threshold = (self.threshold_scale * (max_var - mean_var)) + mean_var - # total_var[total_var <= threshold] = min_var - # total_stds = 1 / total_var return total_stds.detach() def set_statistics(self, stats: dict) -> None: From c1017d1301c3b9cc6c4e8aa786c17ff39472bf78 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 23 Jun 2024 19:24:58 +1200 Subject: [PATCH 40/91] typo --- .../mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py index 709ee29c..89597c8e 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py @@ -39,16 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, - reweigt_critic: bool, - reweigt_actor: bool, + reweight_critic: bool, + reweight_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - self.reweight_critic = reweigt_critic - self.reweight_actor = reweigt_actor + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations From 82c95c787e330d042fc96b1cb1e3192138453355 Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 24 Jun 2024 11:47:59 +1200 Subject: [PATCH 41/91] typo --- .../algorithm/mbrl/DynaSAC_BIV.py | 8 ++++---- .../algorithm/mbrl/DynaSAC_SUNRISE.py | 8 ++++---- 
.../algorithm/mbrl/DynaSAC_ScaleBatchReweight.py | 8 ++++---- .../algorithm/mbrl/DynaSAC_UWAC.py | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py index 17a0d109..6272aee6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py @@ -39,16 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, - reweigt_critic: bool, - reweigt_actor: bool, + reweight_critic: bool, + reweight_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - self.reweight_critic = reweigt_critic - self.reweight_actor = reweigt_actor + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py index 7115ffa9..c5f174d6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py @@ -39,16 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, - reweigt_critic: bool, - reweigt_actor: bool, + reweight_critic: bool, + reweight_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - self.reweight_critic = reweigt_critic - self.reweight_actor = reweigt_actor + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 42542baf..51dec6a2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -39,16 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, - reweigt_critic: bool, - reweigt_actor: bool, + reweight_critic: bool, + reweight_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - self.reweight_critic = reweigt_critic - self.reweight_actor = reweigt_actor + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py index 6380fa4e..33c915b7 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py @@ -39,16 +39,16 @@ def __init__( num_samples: int, horizon: int, threshold_scale: float, - reweigt_critic: bool, - reweigt_actor: bool, + reweight_critic: bool, + reweight_actor: bool, mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - self.reweight_critic = reweigt_critic - 
self.reweight_actor = reweigt_actor + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations From f14639e3dc150b0e6fbaec43f17158503cad4f4b Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 24 Jun 2024 11:51:09 +1200 Subject: [PATCH 42/91] typo --- .../algorithm/mbrl/DynaSAC_BIV.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py index 6272aee6..f728ba60 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py @@ -388,7 +388,7 @@ def sampling(self, pred_means, pred_vars): if self.mode == 3: total_var = var_r - xi = self.get_optimal_xi(total_var.detach().numpy()) + xi = self.get_optimal_xi(total_var.detach().squeeze().numpy()) xi = torch.FloatTensor(xi).to(self.device) total_var += xi @@ -416,7 +416,7 @@ def compute_eff_bs(self, weights): # print(eff_bs) return eff_bs - def get_optimal_xi(self, variances, minimal_size): + def get_optimal_xi(self, variances): minimal_size = self.threshold_scale minimal_size = min(variances.shape[0] - 1, minimal_size) if self.compute_eff_bs(self.get_iv_weights(variances)) >= minimal_size: @@ -427,13 +427,6 @@ def get_optimal_xi(self, variances, minimal_size): xi = 0 if xi is None else xi return xi - def compute_ebs(self, weights): - weights_sum = torch.sum(weights) - weights_square = weights.pow(2) - # ebs = square of sum / sum of square. - ebs = weights_sum.pow(2) / torch.sum(weights_square) - return ebs - def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) From 6491f5499731221aeb3b0398af4e939e754eed3c Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 24 Jun 2024 13:05:07 +1200 Subject: [PATCH 43/91] typo --- .../algorithm/mbrl/DynaSAC_BIV.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py index f728ba60..be8bcb69 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py @@ -388,17 +388,33 @@ def sampling(self, pred_means, pred_vars): if self.mode == 3: total_var = var_r - xi = self.get_optimal_xi(total_var.detach().squeeze().numpy()) - xi = torch.FloatTensor(xi).to(self.device) + # # Exacerbate the sample difference. + # min_var = torch.min(total_var) + # max_var = torch.max(total_var) + # scale_var = max_var - min_var + # # 0 - 1 + # total_var -= min_var + # total_var /= scale_var + # total_var += 0.000001 + xi = self.get_optimal_xi(total_var.detach().cpu().squeeze().numpy()) total_var += xi - # Weight = inverse of sum of weights * inverse of varaince. 
- total_stds = 1.0 / total_var - ratio = 1.0 / torch.sum(total_stds) - total_stds = ratio * total_stds - + weights = 1.0 / total_var + ratio = 1.0 / torch.sum(weights) + total_stds = ratio * weights return total_stds.detach() + + def get_optimal_xi(self, variances): + minimal_size = self.threshold_scale + if self.compute_eff_bs(self.get_iv_weights(variances)) >= minimal_size: + return 0 + fn = lambda x: np.abs(self.compute_eff_bs(self.get_iv_weights(variances + np.abs(x))) - minimal_size) + epsilon = minimize(fn, 0, method='Nelder-Mead', options={'fatol': 1.0, 'maxiter': 100}) + xi = np.abs(epsilon.x[0]) + xi = 0 if xi is None else xi + return xi + def get_iv_weights(self, variances): ''' Returns Inverse Variance weights @@ -413,19 +429,10 @@ def get_iv_weights(self, variances): def compute_eff_bs(self, weights): # Compute original effective mini-batch size eff_bs = 1 / np.sum(np.square(weights)) - # print(eff_bs) + eff_bs = eff_bs / np.shape(weights)[0] return eff_bs - def get_optimal_xi(self, variances): - minimal_size = self.threshold_scale - minimal_size = min(variances.shape[0] - 1, minimal_size) - if self.compute_eff_bs(self.get_iv_weights(variances)) >= minimal_size: - return 0 - fn = lambda x: np.abs(self.compute_eff_bs(self.get_iv_weights(variances + np.abs(x))) - minimal_size) - epsilon = minimize(fn, 0, method='Nelder-Mead', options={'fatol': 1.0, 'maxiter': 100}) - xi = np.abs(epsilon.x[0]) - xi = 0 if xi is None else xi - return xi + def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) From ec3f836cdd62288f360b958e0729eefcfd104a62 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 7 Jul 2024 06:51:26 +1200 Subject: [PATCH 44/91] ablation_actor --- .../algorithm/mbrl/DynaSAC_BIV.py | 8 -- .../mbrl/DynaSAC_ScaleBatchReweight.py | 114 +++++++++--------- 2 files changed, 57 insertions(+), 65 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py index be8bcb69..9d80a541 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py @@ -388,14 +388,6 @@ def sampling(self, pred_means, pred_vars): if self.mode == 3: total_var = var_r - # # Exacerbate the sample difference. - # min_var = torch.min(total_var) - # max_var = torch.max(total_var) - # scale_var = max_var - min_var - # # 0 - 1 - # total_var -= min_var - # total_var /= scale_var - # total_var += 0.000001 xi = self.get_optimal_xi(total_var.detach().cpu().squeeze().numpy()) total_var += xi # Weight = inverse of sum of weights * inverse of varaince. diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 51dec6a2..6e1703c6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -15,7 +15,6 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F - from cares_reinforcement_learning.networks.world_models.ensemble_world import ( EnsembleWorldAndOneReward, ) @@ -25,25 +24,26 @@ class DynaSAC_ScaleBatchReweight: """ Max as ? 
""" + def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, ): self.type = "mbrl" self.device = device @@ -91,7 +91,7 @@ def _alpha(self) -> float: # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -106,13 +106,13 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target values? @@ -122,7 +122,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -179,7 +179,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -188,14 +188,14 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: PrioritizedReplayBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -308,17 +308,18 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. 
for i in range(self.sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) + if self.mode == 0: + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) # Each times, 5 models predict different actions. # [2560, 17] pred_act1, log_pi1, _ = self.actor_net(sample1[i]) @@ -355,38 +356,37 @@ def sampling(self, pred_means, pred_vars): acts = torch.stack(acts) qs = torch.stack(qs) - var_r = torch.var(rs, dim=0) - - if self.mode < 3: + if self.mode == 0: + var_r = torch.var(rs, dim=0) var_a = torch.var(acts, dim=0) var_q = torch.var(qs, dim=0) - # Computing covariance. - if self.mode < 2: mean_a = torch.mean(acts, dim=0, keepdim=True) mean_q = torch.mean(qs, dim=0, keepdim=True) diff_a = acts - mean_a diff_q = qs - mean_q cov_aq = torch.mean(diff_a * diff_q, dim=0) - if self.mode < 1: mean_r = torch.mean(rs, dim=0, keepdim=True) diff_r = rs - mean_r cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) - gamma_sq = self.gamma * self.gamma - # Ablation - if self.mode == 0: + gamma_sq = self.gamma * self.gamma total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - if self.mode == 2: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q - if self.mode == 3: - total_var = var_r + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + # For actor: alpha^2 * var_a + var_q + total_var = (self._alpha ** 2) * var_a + var_q + cov_aq # Exacerbate the sample difference. old_mean_var = torch.mean(total_var) From ad8437f2c6de8d01ffce4b04c9caa195959570b9 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 7 Jul 2024 07:45:40 +1200 Subject: [PATCH 45/91] typo --- .../algorithm/mbrl/DynaSAC_ScaleBatchReweight.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 6e1703c6..92791001 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -308,7 +308,7 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. 
for i in range(self.sample_times): - if self.mode == 0: + if self.reweight_critic == 0: # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1[i]) pred_rwd2 = self.world_model.pred_rewards(sample2[i]) @@ -351,12 +351,12 @@ def sampling(self, pred_means, pred_vars): qs.append(qc) qs.append(qd) qs.append(qe) - - rs = torch.stack(rs) + if self.reweight_critic == 0: + rs = torch.stack(rs) acts = torch.stack(acts) qs = torch.stack(qs) - if self.mode == 0: + if self.reweight_critic: var_r = torch.var(rs, dim=0) var_a = torch.var(acts, dim=0) var_q = torch.var(qs, dim=0) @@ -376,7 +376,7 @@ def sampling(self, pred_means, pred_vars): total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - if self.mode == 1: + if self.reweight_actor: mean_a = torch.mean(acts, dim=0, keepdim=True) mean_q = torch.mean(qs, dim=0, keepdim=True) diff_a = acts - mean_a From 46e1ed850db45eba7a056e276ca19833178c1378 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 7 Jul 2024 20:59:05 +1200 Subject: [PATCH 46/91] typo --- .../algorithm/mbrl/DynaSAC_ScaleBatchReweight.py | 4 ++-- cares_reinforcement_learning/util/configurations.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 92791001..593d8eb0 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -308,7 +308,7 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): - if self.reweight_critic == 0: + if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1[i]) pred_rwd2 = self.world_model.pred_rewards(sample2[i]) @@ -351,7 +351,7 @@ def sampling(self, pred_means, pred_vars): qs.append(qc) qs.append(qd) qs.append(qe) - if self.reweight_critic == 0: + if self.reweight_critic == 1: rs = torch.stack(rs) acts = torch.stack(acts) qs = torch.stack(qs) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index b3d846c6..f512f1a9 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -246,7 +246,7 @@ class DynaSAC_BIVReweightConfig(AlgorithmConfig): reweight_critic: Optional[bool] = True reweight_actor: Optional[bool] = False - mode: Optional[int] = 1 + mode: Optional[int] = 0 sample_times: Optional[int] = 10 From 455c60a4014269f98f4b618ca9357bec883f72ad Mon Sep 17 00:00:00 2001 From: tony Date: Tue, 9 Jul 2024 03:11:25 +1200 Subject: [PATCH 47/91] typo --- .../algorithm/mbrl/DynaSAC_ScaleBatchReweight.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 593d8eb0..1c3c2f56 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -392,6 +392,7 @@ def sampling(self, pred_means, pred_vars): old_mean_var = torch.mean(total_var) # normalize vars to sum = 1 total_var /= old_mean_var + total_var += 0.00000001 min_var = torch.min(total_var) max_var = 
torch.max(total_var) # As (max-min) decrease, threshold should go down. From b22ad962f87054efa29a2fec06b76c640dcfc339 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 12 Jul 2024 17:25:59 +1200 Subject: [PATCH 48/91] typo --- cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py index 33c915b7..2b2f6543 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py @@ -388,7 +388,7 @@ def sampling(self, pred_means, pred_vars): if self.mode == 3: total_var = var_r - total_stds = torch.minimum(self.threshold_scale/total_var, torch.ones(total_var.shape) * 1.5) + total_stds = torch.minimum(self.threshold_scale/total_var, torch.ones(total_var.shape).to(self.device) * 1.5) return total_stds.detach() From 3c53a92a06a729482d436947d3f49b0b63f8a321 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 14 Jul 2024 18:02:42 +1200 Subject: [PATCH 49/91] Distinguish predict reward with next state and with current state and action. --- .../algorithm/mbrl/DynaSAC.py | 2 +- .../algorithm/mbrl/DynaSAC_BIV.py | 2 +- .../mbrl/DynaSAC_BinaryBatchReweight.py | 2 +- .../mbrl/DynaSAC_MaxBatchReweight.py | 2 +- .../DynaSAC_NormalizedSigmoidBatchReweight.py | 2 +- .../algorithm/mbrl/DynaSAC_SA.py | 251 +++++++++++ .../algorithm/mbrl/DynaSAC_SABR.py | 391 ++++++++++++++++++ .../algorithm/mbrl/DynaSAC_SUNRISE.py | 8 +- .../mbrl/DynaSAC_ScaleBatchReweight.py | 2 +- .../algorithm/mbrl/DynaSAC_UWAC.py | 8 +- .../algorithm/mbrl/__init__.py | 2 + .../networks/world_models/__init__.py | 5 +- .../world_models/ensemble_integrated.py | 2 +- ...ensemble_world.py => ensemble_world_sn.py} | 2 +- .../world_models/ensmeble_world_sa.py | 161 ++++++++ .../networks/world_models/simple_reward_sa.py | 51 +++ ...{simple_rewards.py => simple_reward_sn.py} | 0 .../util/configurations.py | 43 ++ .../util/network_factory.py | 85 ++++ 19 files changed, 999 insertions(+), 22 deletions(-) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SABR.py rename cares_reinforcement_learning/networks/world_models/{ensemble_world.py => ensemble_world_sn.py} (99%) create mode 100644 cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py create mode 100644 cares_reinforcement_learning/networks/world_models/simple_reward_sa.py rename cares_reinforcement_learning/networks/world_models/{simple_rewards.py => simple_reward_sn.py} (100%) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index d0c9a046..646302e2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py index 9d80a541..ab374559 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py @@ -15,7 +15,7 
@@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py index 8bf8508b..9cc71a94 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py @@ -14,7 +14,7 @@ import torch from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py index 7ec92fac..65693fd8 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py @@ -14,7 +14,7 @@ import torch from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py index 89597c8e..ad531974 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py @@ -16,7 +16,7 @@ import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py new file mode 100644 index 00000000..f5b23d0d --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py @@ -0,0 +1,251 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
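+
+State-action reward variant: the reward model is trained on (state, action)
+pairs rather than on the predicted next state.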
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F + +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import ( + EnsembleWorldAndOneSAReward, +) + + +class DynaSAC_SA: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneSAReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = 
self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + states=states, + actions=actions, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! 
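+        # Zeros mean "not done" here, so the (1 - dones) bootstrap term stays
+        # active for every model-generated transition.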
+ pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + ) + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SABR.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SABR.py new file mode 100644 index 00000000..8395daad --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SABR.py @@ -0,0 +1,391 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F + +from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import ( + EnsembleWorldAndOneSAReward, +) + + +class DynaSAC_SABR: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneSAReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
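+        # alpha is optimised in log space so the temperature stays positive.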
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + states=states, + actions=actions, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. 
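+        Rewards for the synthetic transitions come from the state-action reward
+        head, i.e. pred_rewards(state, action), and each transition carries a
+        weight produced by sampling().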
+        :param next_states: batch of next states used as the starting points of the rollouts.
+        """
+        pred_states = []
+        pred_actions = []
+        pred_rs = []
+        pred_n_states = []
+        pred_uncerts = []
+        with torch.no_grad():
+            pred_state = next_states
+            for _ in range(self.horizon):
+                pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0)
+                # This choice is debatable, but random actions work better empirically.
+                rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num))
+                pred_acts = torch.FloatTensor(rand_acts).to(self.device)
+
+                pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states(
+                    pred_state, pred_acts
+                )
+                uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var)
+                uncert = uncert.unsqueeze(dim=1).to(self.device)
+                pred_uncerts.append(uncert)
+
+                pred_reward = self.world_model.pred_rewards(pred_state, pred_acts)
+                pred_states.append(pred_state)
+                pred_actions.append(pred_acts.detach())
+                pred_rs.append(pred_reward.detach())
+                pred_n_states.append(pred_next_state.detach())
+                pred_state = pred_next_state.detach()
+            pred_states = torch.vstack(pred_states)
+            pred_actions = torch.vstack(pred_actions)
+            pred_rs = torch.vstack(pred_rs)
+            pred_n_states = torch.vstack(pred_n_states)
+            pred_weights = torch.vstack(pred_uncerts)
+            # Note: the CARES RL code expects `dones` here (these synthetic transitions never terminate).
+            pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device)
+            # states, actions, rewards, next_states, dones
+            self._train_policy(
+                pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights
+            )
+
+    def sampling(self, pred_means, pred_vars):
+        """
+        Estimate the uncertainty of each synthetic transition by sampling from every
+        ensemble member's predictive distribution. High variance means high uncertainty,
+        so the returned weight is the inverse of the estimated total variance.
+
+        :param pred_means: per-model means of the predicted next states.
+        :param pred_vars: per-model variances of the predicted next states.
+        :return: per-sample weights (inverse of the estimated total variance).
+        """
+        with torch.no_grad():
+            # 5 models; sample each predictive distribution `sample_times` times.
+            sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample(
+                [self.sample_times])
+            sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample(
+                [self.sample_times])
+            sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample(
+                [self.sample_times])
+            sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample(
+                [self.sample_times])
+            sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample(
+                [self.sample_times])
+            acts = []
+            qs = []
+            # Vary the sampled next states and measure how much the policy and critic disagree.
+            for i in range(self.sample_times):
+                # At each draw the 5 models yield different next-state samples,
+                # and therefore different actions and log-probabilities.  [2560, 17]
+                pred_act1, log_pi1, _ = self.actor_net(sample1[i])
+                pred_act2, log_pi2, _ = self.actor_net(sample2[i])
+                pred_act3, log_pi3, _ = self.actor_net(sample3[i])
+                pred_act4, log_pi4, _ = self.actor_net(sample4[i])
+                pred_act5, log_pi5, _ = self.actor_net(sample5[i])
+                acts.append(log_pi1)
+                acts.append(log_pi2)
+                acts.append(log_pi3)
+                acts.append(log_pi4)
+                acts.append(log_pi5)
+                # sample1..sample5 describe the same nominal next state predicted by
+                # different models; pred_act1..pred_act5 are therefore different
+                # actions for that same next state.
+ # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.reweight_critic: + gamma_sq = self.gamma * self.gamma + total_var = gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + + if self.reweight_actor: + # For actor: alpha^2 * var_a + var_q + total_var = (self._alpha ** 2) * var_a + var_q + cov_aq + + # Exacerbate the sample difference. + old_mean_var = torch.mean(total_var) + # normalize vars to sum = 1 + total_var /= old_mean_var + total_var += 0.00000001 + min_var = torch.min(total_var) + max_var = torch.max(total_var) + # As (max-min) decrease, threshold should go down. + threshold = self.threshold_scale * (max_var - min_var) + min_var + total_var[total_var <= threshold] = threshold + total_stds = 1 / total_var + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py index c5f174d6..16932b82 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) @@ -381,12 +381,6 @@ def sampling(self, pred_means, pred_vars): if self.mode == 0: total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - if self.mode == 2: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q - if self.mode == 3: - total_var = var_r total_stds = torch.sigmoid(-1 * torch.sqrt(total_var) * 
self.threshold_scale) + 0.5 diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py index 1c3c2f56..bbed2014 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py index 2b2f6543..0eb1353b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) @@ -382,11 +382,7 @@ def sampling(self, pred_means, pred_vars): total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - if self.mode == 2: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q - if self.mode == 3: - total_var = var_r + total_var = gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq total_stds = torch.minimum(self.threshold_scale/total_var, torch.ones(total_var.shape).to(self.device) * 1.5) diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 9614f39e..24e542ec 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -6,3 +6,5 @@ from .DynaSAC_UWAC import DynaSAC_UWACReweight from .DynaSAC_BIV import DynaSAC_BIVReweight from .DynaSAC_NormalizedSigmoidBatchReweight import DynaSAC_NormalizedSigmoidBatchReweight +from .DynaSAC_SA import DynaSAC_SA +from .DynaSAC_SABR import DynaSAC_SABR diff --git a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 0fede213..3542ec33 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,6 +1,9 @@ from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( EnsembleWorldReward, ) -from cares_reinforcement_learning.networks.world_models.ensemble_world import ( +from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, +) +from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import ( + EnsembleWorldAndOneSAReward, ) \ No newline at end of file diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py index d7b10f87..534ff71e 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py +++ 
b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py @@ -12,7 +12,7 @@ from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( SimpleDynamics, ) -from cares_reinforcement_learning.networks.world_models.simple_rewards import ( +from cares_reinforcement_learning.networks.world_models.simple_reward_sn import ( SimpleReward, ) # from cares_reinforcement_learning.networks.world_models.probability_rewards import ( diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_world_sn.py similarity index 99% rename from cares_reinforcement_learning/networks/world_models/ensemble_world.py rename to cares_reinforcement_learning/networks/world_models/ensemble_world_sn.py index 570b0659..f2dc2762 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_world_sn.py @@ -12,7 +12,7 @@ from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( SimpleDynamics, ) -from cares_reinforcement_learning.networks.world_models.simple_rewards import ( +from cares_reinforcement_learning.networks.world_models.simple_reward_sn import ( SimpleReward, ) from cares_reinforcement_learning.util.helpers import normalize_observation_delta diff --git a/cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py b/cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py new file mode 100644 index 00000000..fc4e1f87 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py @@ -0,0 +1,161 @@ +import logging +import math +import random +import sys + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim + +from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( + SimpleDynamics, +) +from cares_reinforcement_learning.networks.world_models.simple_reward_sa import ( + SimpleRewardSA, +) +from cares_reinforcement_learning.util.helpers import normalize_observation_delta + + +class EnsembleWorldAndOneSAReward: + def __init__( + self, + observation_size: int, + num_actions: int, + num_models: int, + lr: float, + device: str, + hidden_size: int = 128, + ): + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + + self.reward_network = SimpleRewardSA( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) + + self.models = [ + SimpleDynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + for _ in range(self.num_models) + ] + + self.optimizers = [optim.Adam(self.models[i].parameters(), lr=lr) for i in range(self.num_models)] + + self.statistics = {} + + # Bring all reward prediction and dynamic rediction networks to device. + self.device = device + self.reward_network.to(self.device) + for model in self.models: + model.to(device) + + def set_statistics(self, statistics: dict) -> None: + """ + Update all statistics for normalization for all world models and the + ensemble itself. 
+ + :param (Dictionary) statistics: + """ + for key, value in statistics.items(): + if isinstance(value, np.ndarray): + statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) + + self.statistics = statistics + for model in self.models: + model.statistics = statistics + + def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor): + pred_rewards = self.reward_network(observation, action) + return pred_rewards + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + means = [] + norm_means = [] + norm_vars = [] + # Iterate over the neural networks and get the predictions + for model in self.models: + # Predict delta + mean, n_mean, n_var = model.forward(observation, actions) + means.append(mean) + norm_means.append(n_mean) + norm_vars.append(n_var) + # Normalized + predictions_means = torch.stack(means) + predictions_norm_means = torch.stack(norm_means) + predictions_vars = torch.stack(norm_vars) + # Get rid of the nans + not_nans = [] + for i in range(self.num_models): + if not torch.any(torch.isnan(predictions_means[i])): + not_nans.append(i) + if len(not_nans) == 0: + logging.info("Predicting all Nans") + sys.exit() + # Random Take next state. + rand_ind = random.randint(0, len(not_nans) - 1) + prediction = predictions_means[not_nans[rand_ind]] + # next = current + delta + prediction += observation + all_predictions = torch.stack(means) + for j in range(all_predictions.shape[0]): + all_predictions[j] += observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # For each model, train with different data. 
+        mini_batch_size = int(math.floor(states.shape[0] / self.num_models))
+
+        for i in range(self.num_models):
+            sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size]
+            sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size]
+            sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size]
+            sub_target = sub_next_states - sub_states
+
+            delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics)
+            _, n_mean, n_var = self.models[i].forward(sub_states, sub_actions)
+            model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean()
+
+            self.optimizers[i].zero_grad()
+            model_loss.backward()
+            self.optimizers[i].step()
+
+    def train_reward(
+        self,
+        states: torch.Tensor,
+        actions: torch.Tensor,
+        rewards: torch.Tensor,
+    ) -> None:
+        self.reward_optimizer.zero_grad()
+        rwd_mean = self.reward_network.forward(states, actions)
+        reward_loss = F.mse_loss(rwd_mean, rewards)
+        reward_loss.backward()
+        self.reward_optimizer.step()
+
+
diff --git a/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py b/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py
new file mode 100644
index 00000000..4626ca25
--- /dev/null
+++ b/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py
@@ -0,0 +1,51 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+from cares_reinforcement_learning.util.helpers import weight_init
+
+
+class SimpleRewardSA(nn.Module):
+    def __init__(self, observation_size: int, num_actions: int, hidden_size: int):
+        """
+        Note: this reward model is limited to rewards in [0, 1] for dm_control.
+        A reward model with fully connected layers. It takes the current states (s)
+        and current actions (a) and predicts the rewards (r).
+
+        :param (int) observation_size -- dimension of states
+        :param (int) num_actions -- dimension of actions
+        :param (int) hidden_size -- number of neurons in each hidden layer.
+        """
+        super().__init__()
+        self.observation_size = observation_size
+        self.num_actions = num_actions
+        self.linear1 = nn.Linear(observation_size + num_actions, hidden_size)
+        self.linear2 = nn.Linear(hidden_size, hidden_size)
+        self.linear3 = nn.Linear(hidden_size, 1)
+        self.apply(weight_init)
+
+    def forward(
+        self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False
+    ) -> torch.Tensor:
+        """
+        Forward the inputs through the network.
+        Note: for the DMCS environments the reward lies in [0, 1].
+
+        :param (Tensor) observation -- batch of observations
+        :param (Tensor) actions -- batch of actions
+        :param (Bool) normalized -- whether to squash the predicted reward into [0, 1]
+
+        :return (Tensor) rwd_mean -- predicted rewards.
+ """ + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + x = torch.cat((observation, actions), dim=1) + x = self.linear1(x) + x = F.relu(x) + x = self.linear2(x) + x = F.relu(x) + rwd_mean = self.linear3(x) + if normalized: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/simple_rewards.py b/cares_reinforcement_learning/networks/world_models/simple_reward_sn.py similarity index 100% rename from cares_reinforcement_learning/networks/world_models/simple_rewards.py rename to cares_reinforcement_learning/networks/world_models/simple_reward_sn.py diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index f512f1a9..60b379fb 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -138,6 +138,49 @@ class SACConfig(AlgorithmConfig): reward_scale: Optional[float] = 1.0 +class DynaSAC_SAConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_SA", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + +class DynaSAC_SABRConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_SABR", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + threshold_scale: Optional[float] = 0.7 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + + mode: Optional[int] = 1 + sample_times: Optional[int] = 10 + + class DynaSACConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC", Literal=True) actor_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 1ad24fb0..515f2552 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -77,6 +77,91 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent +def create_DynaSAC_SA(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SA + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import EnsembleWorldAndOneSAReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneSAReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + lr=config.world_model_lr, + device=device, + ) + + agent = DynaSAC_SA( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + device=device, + ) + return agent + + +def create_DynaSAC_SABR(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SABR + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import EnsembleWorldAndOneSAReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneSAReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + lr=config.world_model_lr, + ) + + agent = DynaSAC_SABR( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + threshold_scale=config.threshold_scale, + reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, + mode=config.mode, + sample_times=config.sample_times, + ) + return agent + + def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. 
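Between the two patches, it may help to see how the pieces added in configurations.py and network_factory.py are meant to fit together. The loop below is a hedged usage sketch, not code from the repository: the Pendulum-v1 environment, the bare PrioritizedReplayBuffer() constructor and its add() call, and the batch size of 256 are all assumptions; only DynaSAC_SAConfig, create_DynaSAC_SA and the agent methods shown in the diffs above come from the patch.

import gymnasium as gym
import numpy as np

from cares_reinforcement_learning.memory import PrioritizedReplayBuffer
from cares_reinforcement_learning.util.configurations import DynaSAC_SAConfig
from cares_reinforcement_learning.util.network_factory import create_DynaSAC_SA

# Illustrative environment choice; any continuous-control task works in principle.
env = gym.make("Pendulum-v1")

config = DynaSAC_SAConfig()  # assumed to instantiate with the defaults shown above
agent = create_DynaSAC_SA(
    observation_size=env.observation_space.shape[0],
    action_num=env.action_space.shape[0],
    config=config,
)

memory = PrioritizedReplayBuffer()  # constructor arguments assumed
batch_size = 256                    # assumed hyperparameter

# agent.set_statistics(...) must be fed observation/delta statistics before the
# world model is trained or Dyna rollouts are generated; that step is omitted here.

state, _ = env.reset()
for step in range(20_000):
    action = agent.select_action_from_policy(np.asarray(state))
    # Rescaling the action to the environment's bounds is omitted for brevity.
    next_state, reward, terminated, truncated, _ = env.step(action)
    memory.add(state, action, reward, next_state, terminated)  # buffer interface assumed
    state = next_state
    if terminated or truncated:
        state, _ = env.reset()
    if step >= 1_000:
        agent.train_world_model(memory, batch_size)
        agent.train_policy(memory, batch_size)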
From e67eebe0b9d47d2bb30e51838150ed5561f99188 Mon Sep 17 00:00:00 2001 From: tony Date: Tue, 16 Jul 2024 17:57:17 +1200 Subject: [PATCH 50/91] Clean Up and Add the Combo --- ...DynaSAC_BIV.py => DynaSAC_BIV_Reweight.py} | 0 ...eweight.py => DynaSAC_Immerse_Reweight.py} | 16 +- ...t.py => DynaSAC_Immerse_Reweight_Combo.py} | 177 ++++---- .../mbrl/DynaSAC_MaxBatchReweight.py | 400 ----------------- .../DynaSAC_NormalizedSigmoidBatchReweight.py | 420 ------------------ ...SABR.py => DynaSAC_SA_Immerse_Reweight.py} | 0 ...SUNRISE.py => DynaSAC_SUNRISE_Reweight.py} | 0 ...naSAC_UWAC.py => DynaSAC_UWAC_Reweight.py} | 0 .../algorithm/mbrl/__init__.py | 14 +- .../networks/world_models/simple_reward_sa.py | 2 +- .../util/configurations.py | 64 +-- .../util/network_factory.py | 119 +---- 12 files changed, 135 insertions(+), 1077 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_BIV.py => DynaSAC_BIV_Reweight.py} (100%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_ScaleBatchReweight.py => DynaSAC_Immerse_Reweight.py} (98%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_BinaryBatchReweight.py => DynaSAC_Immerse_Reweight_Combo.py} (74%) delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_SABR.py => DynaSAC_SA_Immerse_Reweight.py} (100%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_SUNRISE.py => DynaSAC_SUNRISE_Reweight.py} (100%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_UWAC.py => DynaSAC_UWAC_Reweight.py} (100%) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py similarity index 100% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py similarity index 98% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py index bbed2014..ab26efeb 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_ScaleBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py @@ -386,18 +386,22 @@ def sampling(self, pred_means, pred_vars): var_a = torch.var(acts, dim=0) var_q = torch.var(qs, dim=0) # For actor: alpha^2 * var_a + var_q - total_var = (self._alpha ** 2) * var_a + var_q + cov_aq + total_var = (self._alpha ** 2) * var_a + var_q + (self._alpha ** 2) * cov_aq - # Exacerbate the sample difference. - old_mean_var = torch.mean(total_var) - # normalize vars to sum = 1 - total_var /= old_mean_var - total_var += 0.00000001 min_var = torch.min(total_var) max_var = torch.max(total_var) # As (max-min) decrease, threshold should go down. threshold = self.threshold_scale * (max_var - min_var) + min_var total_var[total_var <= threshold] = threshold + + # Exacerbate the sample difference. 
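For reference, the clip-and-invert step that the sampling() methods in this patch series end with can be isolated as below (individual variants also normalise or rescale the variance first). This is an illustrative sketch under my own naming, not the repository's code: variances below a range-dependent threshold are floored, and the weight is the reciprocal of the floored variance, so the most uncertain synthetic transitions receive the smallest weights.

import torch

def variance_to_weights(total_var: torch.Tensor, threshold_scale: float) -> torch.Tensor:
    # Floor the variance at a fraction of its observed range so that near-zero
    # variances cannot blow the inverse weight up.
    min_var, max_var = total_var.min(), total_var.max()
    threshold = threshold_scale * (max_var - min_var) + min_var
    floored = torch.where(total_var <= threshold, threshold, total_var) + 1e-8
    # High variance (high uncertainty) -> small weight.
    return (1.0 / floored).detach()

# Toy example: the largest-variance sample gets the smallest weight.
print(variance_to_weights(torch.tensor([0.1, 0.5, 2.0]), threshold_scale=0.5))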
+ mean_var = torch.mean(total_var) + + ratio = mean_var / ((1.0 / total_var.shape[0]) * (torch.prod(total_var))) + # normalize vars to sum = 1 + total_var *= () + + total_var += 0.00000001 total_stds = 1 / total_var return total_stds.detach() diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py similarity index 74% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py index 9cc71a94..0b5cc06b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BinaryBatchReweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py @@ -13,36 +13,40 @@ import numpy as np import torch from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +import torch.nn.functional as F from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( EnsembleWorldAndOneReward, ) -class DynaSAC_BinaryBatchReweight: +class DynaSAC_Immerse_Reweight_Combo: """ Max as ? """ + def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - mode: int, - sample_times: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale_actor: float, + threshold_scale_critic: float, + sample_times: int, + device: torch.device, ): self.type = "mbrl" self.device = device + self.threshold_scale_actor = threshold_scale_actor + self.threshold_scale_critic = threshold_scale_critic # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) @@ -76,8 +80,6 @@ def __init__( # World model self.world_model = world_network # Parameter - self.threshold_scale = threshold_scale - self.mode = mode self.sample_times = sample_times @property @@ -86,7 +88,7 @@ def _alpha(self) -> float: # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -101,13 +103,14 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + critic_weights: torch.Tensor, + actor_weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target values? 
@@ -117,30 +120,25 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values q_values_one, q_values_two = self.critic_net(states, actions) - # Original loss function + # Reweighted loss function. weight not participant in training. l2_loss_one = (q_values_one - q_target).pow(2) l2_loss_two = (q_values_two - q_target).pow(2) - - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights + critic_weights = critic_weights.detach() + disc_l2_loss_one = l2_loss_one * critic_weights + disc_l2_loss_two = l2_loss_two * critic_weights # A ratio to scale the loss back to original loss scale. - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) ratio_1 = ratio_1.detach() ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) ratio_2 = ratio_2.detach() - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - critic_loss_total = critic_loss_one + critic_loss_two # Update the Critic @@ -152,7 +150,13 @@ def _train_policy( pi, first_log_p, _ = self.actor_net(states) qf1_pi, qf2_pi = self.critic_net(states, pi) min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + actor_weights = actor_weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * actor_weights + actor_ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + actor_ratio = actor_ratio.detach() + actor_loss = actor_ratio * torch.mean(disc_actor_loss) # Update the Actor self.actor_net_optimiser.zero_grad() @@ -161,7 +165,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -170,14 +174,14 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: PrioritizedReplayBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -217,7 +221,8 @@ def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None rewards=rewards, next_states=next_states, dones=dones, - weights=full_weights, + critic_weights=full_weights, + actor_weights=full_weights, ) # # # Step 3 Dyna add more data self._dyna_generate_and_train(next_states=next_states) @@ -231,7 +236,9 @@ def _dyna_generate_and_train(self, next_states): pred_actions = [] pred_rs = [] pred_n_states = [] - pred_uncerts = [] + pred_uncerts_actor = [] + pred_uncerts_critic = [] + with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): @@ -243,9 +250,11 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = 
self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) + critic_uncert, actor_uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + critic_uncert = critic_uncert.unsqueeze(dim=1).to(self.device) + actor_uncert = actor_uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts_critic.append(critic_uncert) + pred_uncerts_actor.append(actor_uncert) pred_reward = self.world_model.pred_rewards(pred_next_state) pred_states.append(pred_state) @@ -257,12 +266,13 @@ def _dyna_generate_and_train(self, next_states): pred_actions = torch.vstack(pred_actions) pred_rs = torch.vstack(pred_rs) pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) + pred_uncerts_actor = torch.vstack(pred_uncerts_actor) + pred_uncerts_critic = torch.vstack(pred_uncerts_critic) # Pay attention to here! It is dones in the Cares RL Code! pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_uncerts_critic, pred_uncerts_actor ) def sampling(self, pred_means, pred_vars): @@ -338,47 +348,46 @@ def sampling(self, pred_means, pred_vars): qs = torch.stack(qs) var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) - if self.mode < 3: - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - # Computing covariance. - if self.mode < 2: - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) - if self.mode < 1: - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) gamma_sq = self.gamma * self.gamma - # Ablation - if self.mode == 0: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - if self.mode == 2: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q - if self.mode == 3: - total_var = var_r - - # Exacerbate the sample difference. - min_var = torch.min(total_var) - max_var = torch.max(total_var) - # scale_var = max_var - min_var - mean_var = torch.mean(total_var) - threshold = (self.threshold_scale * (max_var - mean_var)) + mean_var - total_var[total_var <= threshold] = min_var - total_stds = 1 / total_var - return total_stds.detach() + + critic_total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + + # For actor: alpha^2 * var_a + var_q + actor_total_var = (self._alpha ** 2) * var_a + var_q + (self._alpha ** 2) * cov_aq + + critic_min_var = torch.min(critic_total_var) + critic_max_var = torch.max(critic_total_var) + # As (max-min) decrease, threshold should go down. 
+ critic_threshold = self.threshold_scale_critic * (critic_max_var - critic_min_var) + critic_min_var + critic_total_var[critic_total_var <= critic_threshold] = critic_threshold + + actor_min_var = torch.min(actor_total_var) + actor_max_var = torch.max(actor_total_var) + actor_threshold = self.threshold_scale_actor * (actor_max_var - actor_min_var) + actor_min_var + actor_total_var[actor_total_var <= actor_threshold] = actor_threshold + + actor_total_var += 0.00000001 + critic_total_var += 0.00000001 + critic_total_stds = 1 / critic_total_var + actor_total_stds = 1 / actor_total_var + + return critic_total_stds.detach(), actor_total_stds.detach() def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py deleted file mode 100644 index 65693fd8..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_MaxBatchReweight.py +++ /dev/null @@ -1,400 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer - -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( - EnsembleWorldAndOneReward, -) - - -class DynaSAC_MaxBatchReweight: - """ - Max as ? - """ - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - variance_scale: float, - mode: int, - sample_times: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - # Parameter - self.threshold_scale = threshold_scale - self.variance_scale = variance_scale - self.mode = mode - self.sample_times = sample_times - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - # Original loss function - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - # Reweighted loss function. weight not participant in training. - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. 
- - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy Dyna will work. - :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. 
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, pred_means, pred_vars): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) - rs = [] - acts = [] - qs = [] - # Varying the next_state's distribution. - for i in range(self.sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. - # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. 
- # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - - rs = torch.stack(rs) - acts = torch.stack(acts) - qs = torch.stack(qs) - - var_r = torch.var(rs, dim=0) - - if self.mode < 3: - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - # Computing covariance. - if self.mode < 2: - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - if self.mode < 1: - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - - cov_ra = torch.mean(diff_r * diff_a, dim=0) - - gamma_sq = self.gamma * self.gamma - # Ablation - if self.mode == 0: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - if self.mode == 2: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q - if self.mode == 3: - total_var = var_r - - # Exacerbate the sample difference. - min_var = torch.min(total_var) - max_var = torch.max(total_var) - total_var /= (max_var - min_var) - threshold = self.threshold_scale - total_var[total_var <= threshold] = self.variance_scale - total_stds = 1 / total_var - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py deleted file mode 100644 index ad531974..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NormalizedSigmoidBatchReweight.py +++ /dev/null @@ -1,420 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
- -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -import torch.nn.functional as F - - -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( - EnsembleWorldAndOneReward, -) - - -class DynaSAC_NormalizedSigmoidBatchReweight: - """ - Max as ? - """ - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - self.reweight_critic = reweight_critic - self.reweight_actor = reweight_actor - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - # Parameter - self.threshold_scale = threshold_scale - self.mode = mode - self.sample_times = sample_times - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - if self.reweight_critic: - # Reweighted loss function. 
weight not participant in training. - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. - - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - else: - critic_loss_one = F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - - if self.reweight_actor: - weights = weights.detach() - a_loss = (self._alpha * first_log_p) - min_qf_pi - disc_actor_loss = a_loss * weights - ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) - ratio = ratio.detach() - actor_loss = ratio * torch.mean(disc_actor_loss) - else: - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - 
actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy Dyna will work. - :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, pred_means, pred_vars): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) - rs = [] - acts = [] - qs = [] - # Varying the next_state's distribution. - for i in range(self.sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. 
- # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. - # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - - rs = torch.stack(rs) - acts = torch.stack(acts) - qs = torch.stack(qs) - - var_r = torch.var(rs, dim=0) - - if self.mode < 3: - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - # Computing covariance. - if self.mode < 2: - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - if self.mode < 1: - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - - cov_ra = torch.mean(diff_r * diff_a, dim=0) - - gamma_sq = self.gamma * self.gamma - # Ablation - if self.mode == 0: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - if self.mode == 2: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q - if self.mode == 3: - total_var = var_r - - # Exacerbate the sample difference. 
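For reference, the ablation above amounts to the following standalone computation. This is an illustrative sketch only, assuming the stacked ensemble samples `rs`, `log_pis` (the `acts` list above collects log-probabilities) and `qs` each have shape [num_samples, batch, 1]:

import torch

def td_target_variance(rs: torch.Tensor, log_pis: torch.Tensor, qs: torch.Tensor,
                       gamma: float, mode: int = 0) -> torch.Tensor:
    # rs, log_pis, qs: stacked ensemble samples, shape [num_samples, batch, 1].
    gamma_sq = gamma * gamma
    var_r = torch.var(rs, dim=0)
    if mode >= 3:
        return var_r
    var_a = torch.var(log_pis, dim=0)
    var_q = torch.var(qs, dim=0)
    if mode == 2:
        return var_r + gamma_sq * (var_a + var_q)
    diff_a = log_pis - log_pis.mean(dim=0, keepdim=True)
    diff_q = qs - qs.mean(dim=0, keepdim=True)
    cov_aq = (diff_a * diff_q).mean(dim=0)
    if mode == 1:
        return var_r + gamma_sq * (var_a + var_q + 2 * cov_aq)
    diff_r = rs - rs.mean(dim=0, keepdim=True)
    cov_rq = (diff_r * diff_q).mean(dim=0)
    cov_ra = (diff_r * diff_a).mean(dim=0)
    return var_r + gamma_sq * (var_a + var_q + 2 * (cov_aq + cov_rq + cov_ra))

Lower total variance then maps to a larger weight via the min-max normalization and sigmoid that follow.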
- min_var = torch.min(total_var) - max_var = torch.max(total_var) - scale_var = max_var - min_var - # 0 - 1 - total_var -= min_var - total_var /= scale_var - # 0 - scale - total_var *= self.threshold_scale - total_stds = torch.sigmoid(-1.0 * total_var) + 0.5 - - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SABR.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py similarity index 100% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SABR.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py similarity index 100% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py similarity index 100% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 24e542ec..b5692e5b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,10 +1,8 @@ from .DynaSAC import DynaSAC -from .DynaSAC_ScaleBatchReweight import DynaSAC_ScaleBatchReweight -from .DynaSAC_BinaryBatchReweight import DynaSAC_BinaryBatchReweight -from .DynaSAC_MaxBatchReweight import DynaSAC_MaxBatchReweight -from .DynaSAC_SUNRISE import DynaSAC_SUNRISEReweight -from .DynaSAC_UWAC import DynaSAC_UWACReweight -from .DynaSAC_BIV import DynaSAC_BIVReweight -from .DynaSAC_NormalizedSigmoidBatchReweight import DynaSAC_NormalizedSigmoidBatchReweight +from .DynaSAC_Immerse_Reweight import DynaSAC_ScaleBatchReweight +from .DynaSAC_Immerse_Reweight_Combo import DynaSAC_Immerse_Reweight_Combo +from .DynaSAC_SUNRISE_Reweight import DynaSAC_SUNRISEReweight +from .DynaSAC_UWAC_Reweight import DynaSAC_UWACReweight +from .DynaSAC_BIV_Reweight import DynaSAC_BIVReweight from .DynaSAC_SA import DynaSAC_SA -from .DynaSAC_SABR import DynaSAC_SABR +from .DynaSAC_SA_Immerse_Reweight import DynaSAC_SABR diff --git a/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py b/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py index 4626ca25..d94ff29c 100644 --- 
a/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py +++ b/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py @@ -24,7 +24,7 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): self.apply(weight_init) def forward( - self, observation: torch.Tensor, actions:torch.Tensor, normalized: bool = False + self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False ) -> torch.Tensor: """ Forward the inputs throught the network. diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 60b379fb..85841c65 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -199,8 +199,8 @@ class DynaSACConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 -class DynaSAC_BinaryBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_BinaryBatchReweight", Literal=True) +class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_ScaleBatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -217,35 +217,15 @@ class DynaSAC_BinaryBatchReweightConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 threshold_scale: Optional[float] = 0.7 - mode: Optional[int] = 1 - sample_times: Optional[int] = 10 - - -class DynaSAC_MaxBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_MaxBatchReweight", Literal=True) - actor_lr: Optional[float] = 3e-4 - critic_lr: Optional[float] = 3e-4 - - alpha_lr: Optional[float] = 3e-4 - use_bounded_active: Optional[bool] = False - num_models: Optional[int] = 5 - - gamma: Optional[float] = 0.99 - tau: Optional[float] = 0.005 - reward_scale: Optional[float] = 1.0 - - horizon: Optional[int] = 1 - num_samples: Optional[int] = 10 - world_model_lr: Optional[float] = 0.001 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False - threshold_scale: Optional[float] = 0.7 - variance_scale: Optional[float] = 0.1 mode: Optional[int] = 1 sample_times: Optional[int] = 10 -class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_ScaleBatchReweight", Literal=True) +class DynaSAC_Immerse_Reweight_ComboConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Immerse_Reweight_Combo", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -261,13 +241,12 @@ class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 - threshold_scale: Optional[float] = 0.7 - reweight_critic: Optional[bool] = True - reweight_actor: Optional[bool] = False + threshold_scale_critic: Optional[float] = 0.7 + threshold_scale_actor: Optional[float] = 0.7 - mode: Optional[int] = 1 sample_times: Optional[int] = 10 + class DynaSAC_BIVReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_BIVReweight", Literal=True) actor_lr: Optional[float] = 3e-4 @@ -342,31 +321,6 @@ class DynaSAC_UWACReweightConfig(AlgorithmConfig): sample_times: Optional[int] = 10 -class DynaSAC_NormalizedSigmoidBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_NormalizedSigmoidBatchReweight", Literal=True) - actor_lr: Optional[float] = 3e-4 - critic_lr: Optional[float] = 3e-4 - - alpha_lr: Optional[float] = 3e-4 - use_bounded_active: Optional[bool] = False - num_models: Optional[int] = 5 - - gamma: Optional[float] = 0.99 - tau: 
Optional[float] = 0.005 - reward_scale: Optional[float] = 1.0 - - horizon: Optional[int] = 1 - num_samples: Optional[int] = 10 - world_model_lr: Optional[float] = 0.001 - - threshold_scale: Optional[float] = 4.0 - reweight_critic: Optional[bool] = True - reweight_actor: Optional[bool] = False - - mode: Optional[int] = 1 - sample_times: Optional[int] = 10 - - class NaSATD3Config(AlgorithmConfig): algorithm: str = Field("NaSATD3", Literal=True) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 515f2552..77ad6fa0 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -206,13 +206,14 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo ) return agent -def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): + +def create_DynaSAC_Immerse_Reweight_Combo(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Immerse_Reweight_Combo from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -229,7 +230,7 @@ def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmCo lr=config.world_model_lr, ) - agent = DynaSAC_BIVReweight( + agent = DynaSAC_Immerse_Reweight_Combo( actor_network=actor, critic_network=critic, world_network=world_model, @@ -242,21 +243,20 @@ def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmCo alpha_lr=config.alpha_lr, horizon=config.horizon, num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - reweight_critic=config.reweight_critic, - reweight_actor=config.reweight_actor, - mode=config.mode, + threshold_scale_critic=config.threshold_scale_critic, + threshold_scale_actor=config.threshold_scale_actor, sample_times=config.sample_times, ) return agent -def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): + +def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -273,7 +273,7 @@ def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: Algorit lr=config.world_model_lr, ) - agent = DynaSAC_SUNRISEReweight( + agent = DynaSAC_BIVReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -294,13 +294,13 @@ def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: Algorit ) return agent -def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -317,7 +317,7 @@ def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmC lr=config.world_model_lr, ) - agent = DynaSAC_UWACReweight( + agent = DynaSAC_SUNRISEReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -339,13 +339,13 @@ def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmC return agent -def create_DynaSAC_NormalizedSigmoidBatchReweight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NormalizedSigmoidBatchReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward @@ -362,7 +362,7 @@ def create_DynaSAC_NormalizedSigmoidBatchReweight(observation_size, action_num, lr=config.world_model_lr, ) - agent = DynaSAC_NormalizedSigmoidBatchReweight( + agent = DynaSAC_UWACReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -384,93 +384,6 @@ def create_DynaSAC_NormalizedSigmoidBatchReweight(observation_size, action_num, return agent -def create_DynaSAC_MaxBatchReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_MaxBatchReweight - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAC_MaxBatchReweight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - variance_scale=config.variance_scale, - mode=config.mode, - sample_times=config.sample_times, - ) - return agent - - -def create_DynaSAC_BinaryBatchReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. 
- - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BinaryBatchReweight - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAC_BinaryBatchReweight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - mode=config.mode, - sample_times=config.sample_times, - ) - return agent - - def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. From dc948adde51e2ba1e3a65c46ac051b0cfe27e0a7 Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 24 Jul 2024 19:49:08 +1200 Subject: [PATCH 51/91] Clean Up and Add the Combo --- .../algorithm/mbrl/DynaSAC_Immerse_Reweight.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py index ab26efeb..ec709286 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py @@ -395,11 +395,10 @@ def sampling(self, pred_means, pred_vars): total_var[total_var <= threshold] = threshold # Exacerbate the sample difference. - mean_var = torch.mean(total_var) - - ratio = mean_var / ((1.0 / total_var.shape[0]) * (torch.prod(total_var))) + # mean_var = torch.mean(total_var) + # ratio = mean_var / ((1.0 / total_var.shape[0]) * (torch.prod(total_var))) # normalize vars to sum = 1 - total_var *= () + # total_var *= () total_var += 0.00000001 total_stds = 1 / total_var From 36dea8b58504b147f9eb2ebcf61c1dd456f1dcde Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 2 Aug 2024 22:23:15 +1200 Subject: [PATCH 52/91] Tidy up ensemble models. 
--- .../algorithm/mbrl/DynaSAC.py | 49 +-- .../algorithm/mbrl/DynaSAC_BIV_Reweight.py | 2 +- .../mbrl/DynaSAC_Immerse_Reweight.py | 2 +- .../mbrl/DynaSAC_Immerse_Reweight_Combo.py | 2 +- .../algorithm/mbrl/DynaSAC_SA.py | 2 +- .../mbrl/DynaSAC_SA_Immerse_Reweight.py | 2 +- .../mbrl/DynaSAC_SUNRISE_Reweight.py | 2 +- .../algorithm/mbrl/DynaSAC_UWAC_Reweight.py | 2 +- .../algorithm/mbrl/STEVESAC.py | 309 ++++++++++++++++++ .../algorithm/mbrl/__init__.py | 3 +- .../networks/world_models/__init__.py | 4 +- .../networks/world_models/ensemble_all.py | 194 +++++++++++ .../world_models/ensemble_integrated.py | 15 +- ...emble_world_sn.py => ensemble_ns_world.py} | 12 +- ...meble_world_sa.py => ensmeble_sa_world.py} | 12 +- ..._dynamics.py => probabilistic_dynamics.py} | 3 +- .../world_models/probabilistic_sas_reward.py | 54 +++ ...imple_reward_sn.py => simple_ns_reward.py} | 2 +- ...imple_reward_sa.py => simple_sa_reward.py} | 2 +- ...bability_rewards.py => simple_sas_done.py} | 21 +- .../util/configurations.py | 16 + .../util/network_factory.py | 48 ++- 22 files changed, 667 insertions(+), 91 deletions(-) create mode 100644 cares_reinforcement_learning/networks/world_models/ensemble_all.py rename cares_reinforcement_learning/networks/world_models/{ensemble_world_sn.py => ensemble_ns_world.py} (94%) rename cares_reinforcement_learning/networks/world_models/{ensmeble_world_sa.py => ensmeble_sa_world.py} (94%) rename cares_reinforcement_learning/networks/world_models/{simple_dynamics.py => probabilistic_dynamics.py} (98%) create mode 100644 cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py rename cares_reinforcement_learning/networks/world_models/{simple_reward_sn.py => simple_ns_reward.py} (98%) rename cares_reinforcement_learning/networks/world_models/{simple_reward_sa.py => simple_sa_reward.py} (98%) rename cares_reinforcement_learning/networks/world_models/{probability_rewards.py => simple_sas_done.py} (74%) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py index 646302e2..e10eadf1 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py @@ -16,17 +16,17 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( - EnsembleWorldAndOneReward, +from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( + EnsembleWorldAndOneSAReward, ) -class DynaSAC: +class DynaSAC_SA: def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, + world_network: EnsembleWorldAndOneSAReward, gamma: float, tau: float, action_num: int, @@ -100,6 +100,7 @@ def _train_policy( next_states: torch.Tensor, dones: torch.Tensor, ) -> None: + ################## Update the Critic First #################### with torch.no_grad(): next_actions, next_log_pi, _ = self.actor_net(next_states) @@ -108,7 +109,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -170,7 +171,8 @@ def train_world_model( next_states=next_states, ) self.world_model.train_reward( - next_states=next_states, + states=states, + actions=actions, rewards=rewards, ) @@ -195,41 
+197,6 @@ def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None next_states=next_states, dones=dones, ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, _, _ = self.world_model.pred_next_states( - pred_state, pred_acts - ) - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones - ) def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py index ab374559..36f6902f 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( +from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py index ec709286..5ba056f6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( +from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py index 0b5cc06b..c72a9832 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( +from 
cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py index f5b23d0d..c812574d 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import ( +from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( EnsembleWorldAndOneSAReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py index 8395daad..3cc8804d 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import ( +from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( EnsembleWorldAndOneSAReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py index 16932b82..84c1ca0d 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( +from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py index 0eb1353b..ad4abfbe 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( +from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py index e69de29b..926e15ab 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py @@ -0,0 +1,309 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os +import numpy as np +import torch + +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_all import ( + EnsembleWorldRewardDone, +) + + +class STEVE: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldRewardDone, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + L: int, + device: torch.device, + ): + self.L = L + self.horizon = horizon + self.type = "mbrl" + self.device = device + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + self.gamma = gamma + self.tau = tau + + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + with torch.no_grad(): + # cumulative_rewards = rewards + # pred_s = next_states + # tree_mask = dones.squeeze().bool() + # q_means = [] + # q_vars = [] + + # for hori in range(self.horizon): + # # As normal + # pred_a, _, _ = self.actor_net(pred_s) + # # Pred the future + # pred_s, pred_r, pred_done = self.env.tensor_query(pred_s, pred_a) + # pred_s = pred_s.to(self.device) + # pred_r = pred_r.to(self.device) + # pred_done = pred_done.bool().to(self.device) + # # Before adding pred to mask + # pred_r[tree_mask, :] = 0.0 + # cumulative_rewards += pred_r * (self.gamma ** (hori + 1)) + # # Kill the branch with the previous + # tree_mask = torch.logical_or(tree_mask, pred_done.squeeze()) + # q_target = cumulative_rewards + + # Expand the value estimation here to STEVE. + # Maintain a list of Q values for each horizon with the same size! + # Q = r + yr + y^2 * (q - pi) + # Propagate uncertainty with sampling. 
[256, 17] -> [10, 256, 17] -> [100, 256, 17] -> [1000, 256, 17] + # Q : [256, 1], [256, 1], [256, 1] + + # not_dones = (1 - dones).squeeze().bool() + # pred_all_next_obs = next_states.unsqueeze(dim=0) + # pred_all_next_rewards = torch.zeros(rewards.shape).unsqueeze(dim=0) + # + # q_means = [] + # q_vars = [] + # + # for hori in range(self.horizon): + # horizon_rewards_list = [] + # horizon_obs_list = [] + # horizon_q_list = [] + # + # for stat in range(pred_all_next_obs.shape[0]): + # # Optimal sampling + # pred_action, pred_log_pi, _ = self.actor_net.sample(pred_all_next_obs[stat]) + # + # pred_q1, pred_q2 = self.target_critic_net(pred_all_next_obs[stat], pred_action) + # # V = Q - alpha * logi + # pred_v1 = pred_q1 - self._alpha * pred_log_pi + # pred_v2 = pred_q2 - self._alpha * pred_log_pi + # + # # Predict a set of reward first + # _, pred_rewards = self.world_model.pred_rewards(observation=pred_all_next_obs[stat], + # action=pred_action) + # + # temp_disc_rewards = [] + # # For each predict reward. + # for rwd in range(pred_rewards.shape[0]): + # disc_pred_reward = not_dones * (self.gamma ** (hori + 1)) * pred_rewards[rwd] + # if hori > 0: + # # Horizon = 1, 2, 3, 4, 5 + # disc_sum_reward = pred_all_next_rewards[stat] + disc_pred_reward + # else: + # disc_sum_reward = not_dones * disc_pred_reward + # temp_disc_rewards.append(disc_sum_reward) + # assert rewards.shape == not_dones.shape == disc_sum_reward.shape + # # Q = r + disc_rewards + pred_v + # pred_tq1 = rewards + disc_sum_reward + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + # pred_tq2 = rewards + disc_sum_reward + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + # horizon_q_list.append(pred_tq1) + # horizon_q_list.append(pred_tq2) + # + # # Observation Level + # if hori < (self.horizon - 1): + # _, pred_obs, _, _ = self.world_model.pred_next_states(pred_all_next_obs[stat], pred_action) + # + # horizon_obs_list.append(pred_obs) + # horizon_rewards_list.append(torch.stack(temp_disc_rewards)) + # + # # Horizon level. 
+ # if hori < (self.horizon - 1): + # pred_all_next_obs = torch.vstack(horizon_obs_list) + # pred_all_next_rewards = torch.vstack(horizon_rewards_list) + # + # # Statistics of target q + # h_0 = torch.stack(horizon_q_list) + # mean_0 = torch.mean(h_0, dim=0) + # q_means.append(mean_0) + # var_0 = torch.var(h_0, dim=0) + # var_0[torch.abs(var_0) < 0.001] = 0.001 + # var_0 = 1.0 / var_0 + # q_vars.append(var_0) + # all_means = torch.stack(q_means) + # all_vars = torch.stack(q_vars) + # total_vars = torch.sum(all_vars, dim=0) + # for n in range(self.horizon): + # all_vars[n] /= total_vars + # q_target = torch.sum(all_vars * all_means, dim=0) + + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + # Brief Evaluate the world model and reward prediciton. 
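The commented-out block above sketches a STEVE-style target: candidate targets are formed at several rollout horizons and combined with inverse-variance weights. A minimal, self-contained sketch of that combination step (tensor shapes are assumed, not taken from this file):

import torch

def steve_target(q_candidates: list[torch.Tensor], eps: float = 1e-3) -> torch.Tensor:
    # q_candidates[h]: a [num_estimates, batch, 1] stack of target samples for horizon h.
    means = torch.stack([q.mean(dim=0) for q in q_candidates])
    inv_vars = torch.stack([1.0 / q.var(dim=0).clamp_min(eps) for q in q_candidates])
    weights = inv_vars / inv_vars.sum(dim=0, keepdim=True)  # sums to 1 over horizons
    return (weights * means).sum(dim=0)  # [batch, 1]

Horizons whose estimates disagree receive less weight, which is why the commented code clamps the per-horizon variance away from zero before inverting it.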
+ next_s, _, _, _ = self.world_model.pred_next_states(states, actions) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + states=states, + actions=actions, + rewards=rewards, + ) + + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + # experiences = memory.sample_uniform(batch_size) + # states, actions, rewards, next_states, dones, _ = experiences + # + # # Convert into tensor + # states = torch.FloatTensor(np.asarray(states)).to(self.device) + # actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + # rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + # next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + # dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + # + # # Step 2 train as usual + # self._train_policy( + # states=states, + # actions=actions, + # rewards=rewards, + # next_states=next_states, + # dones=dones, + # ) + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index b5692e5b..bf17ec5a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,4 +1,4 @@ -from .DynaSAC import DynaSAC +from .DynaSAC import DynaSAC_SA from .DynaSAC_Immerse_Reweight import DynaSAC_ScaleBatchReweight from .DynaSAC_Immerse_Reweight_Combo import DynaSAC_Immerse_Reweight_Combo from .DynaSAC_SUNRISE_Reweight import DynaSAC_SUNRISEReweight @@ -6,3 +6,4 @@ from .DynaSAC_BIV_Reweight import DynaSAC_BIVReweight from .DynaSAC_SA import DynaSAC_SA from .DynaSAC_SA_Immerse_Reweight import DynaSAC_SABR +from .STEVESAC import STEVE \ No newline at end of file diff --git a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 3542ec33..211923e2 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,9 +1,9 @@ from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( EnsembleWorldReward, ) -from cares_reinforcement_learning.networks.world_models.ensemble_world_sn import ( +from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneReward, ) -from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import ( +from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( EnsembleWorldAndOneSAReward, ) \ No newline at end of file diff 
--git a/cares_reinforcement_learning/networks/world_models/ensemble_all.py b/cares_reinforcement_learning/networks/world_models/ensemble_all.py
new file mode 100644
index 00000000..53219d33
--- /dev/null
+++ b/cares_reinforcement_learning/networks/world_models/ensemble_all.py
@@ -0,0 +1,194 @@
+import logging
+import math
+import random
+import sys
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils
+from torch import optim
+
+from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import (
+    ProbabilisticDynamics,
+)
+from cares_reinforcement_learning.networks.world_models.probabilistic_sas_reward import (
+    Probabilistic_SAS_Reward,
+)
+from cares_reinforcement_learning.networks.world_models.simple_sas_done import (
+    SASDone,
+)
+from cares_reinforcement_learning.util.helpers import normalize_observation_delta
+
+
+class EnsembleWorldRewardDone:
+    """
+    This class consists of an ensemble of all components for the critic update:
+    Q_label = REWARD + gamma * (1 - DONES) * Q(NEXT_STATES).
+
+    """
+
+    def __init__(
+        self,
+        observation_size: int,
+        num_actions: int,
+        num_world_models: int,
+        num_reward_models: int,
+        num_done_models: int,
+        lr: float,
+        device: str,
+        hidden_size: int = 128,
+    ):
+        self.num_done_models = num_done_models
+        self.num_reward_models = num_reward_models
+        self.num_world_models = num_world_models
+
+        self.observation_size = observation_size
+        self.num_actions = num_actions
+        self.device = device
+
+        self.world_models = [ProbabilisticDynamics(observation_size=observation_size, num_actions=num_actions,
+                                                   hidden_size=hidden_size) for _ in range(self.num_world_models)]
+        self.reward_models = [Probabilistic_SAS_Reward(observation_size=observation_size, num_actions=num_actions,
+                                                       hidden_size=hidden_size) for _ in range(self.num_reward_models)]
+        self.world_optimizers = [optim.Adam(self.world_models[i].parameters(), lr=lr) for i in
+                                 range(self.num_world_models)]
+        self.reward_optimizers = [optim.Adam(self.reward_models[i].parameters(), lr=lr) for i in
+                                  range(self.num_reward_models)]
+
+        # Bring all reward prediction and dynamics prediction networks to the device.
+        for reward_model in self.reward_models:
+            reward_model.to(self.device)
+        for world_model in self.world_models:
+            world_model.to(self.device)
+
+        self.done_model = SASDone(observation_size=observation_size, num_actions=num_actions,
+                                  hidden_size=hidden_size)
+        self.done_optimizers = optim.Adam(self.done_model.parameters(), lr=lr)
+        self.done_model.to(self.device)
+        self.statistics = {}
+
+    def set_statistics(self, statistics: dict) -> None:
+        """
+        Update all statistics for normalization for all world models and the
+        ensemble itself.

+ + :param (Dictionary) statistics: + """ + for key, value in statistics.items(): + if isinstance(value, np.ndarray): + statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) + self.statistics = statistics + for model in self.world_models: + model.statistics = statistics + + def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor): + """ + predict reward based on current observation and action and next state + """ + pred_rewards = [] + pred_reward_vars = [] + for i in range(self.num_reward_models): + pred_reward, reward_var = self.reward_models[i].forward(observation, action, next_observation) + pred_rewards.append(pred_reward) + pred_reward_vars.append(reward_var) + pred_rewards = torch.vstack(pred_rewards) + pred_reward_vars = torch.vstack(pred_reward_vars) + return pred_rewards, pred_reward_vars + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + means = [] + norm_means = [] + norm_vars = [] + # Iterate over the neural networks and get the predictions + for model in self.world_models: + # Predict delta + mean, n_mean, n_var = model.forward(observation, actions) + means.append(mean) + norm_means.append(n_mean) + norm_vars.append(n_var) + # Normalized + predictions_means = torch.stack(means) + predictions_norm_means = torch.stack(norm_means) + predictions_vars = torch.stack(norm_vars) + # Get rid of the nans + not_nans = [] + for i in range(self.num_world_models): + if not torch.any(torch.isnan(predictions_means[i])): + not_nans.append(i) + if len(not_nans) == 0: + logging.info("Predicting all Nans") + sys.exit() + # Random Take next state. + rand_ind = random.randint(0, len(not_nans) - 1) + prediction = predictions_means[not_nans[rand_ind]] + # next = current + delta + prediction += observation + all_predictions = torch.stack(means) + for j in range(all_predictions.shape[0]): + all_predictions[j] += observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # For each model, train with different data. 
+ mini_batch_size = int(math.floor(states.shape[0] / self.num_world_models)) + + for i in range(self.num_world_models): + sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_target = sub_next_states - sub_states + delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) + _, n_mean, n_var = self.world_models[i].forward(sub_states, sub_actions) + model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + self.world_optimizers[i].zero_grad() + model_loss.backward() + self.world_optimizers[i].step() + + def train_reward( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + rewards: torch.Tensor, + ) -> None: + mini_batch_size = int(math.floor(states.shape[0] / self.num_reward_models)) + for i in range(self.num_reward_models): + sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_rewards = rewards[i * mini_batch_size: (i + 1) * mini_batch_size] + self.reward_optimizers[i].zero_grad() + rwd_mean, rwd_var = self.reward_models[i].forward(sub_states, sub_actions, sub_next_states) + # reward_loss = F.mse_loss(rwd_mean, sub_rewards) + reward_loss = F.gaussian_nll_loss(input=rwd_mean, target=sub_rewards, var=rwd_var).mean() + reward_loss.backward() + self.reward_optimizers[i].step() + + # def train_done( + # self, + # states: torch.Tensor, + # actions: torch.Tensor, + # dones: torch.Tensor, + # ) -> None: + # self.reward_optimizer.zero_grad() + # prob_dones = self.reward_network.forward(states, actions) + # reward_loss = F.binary_cross_entropy(prob_dones, dones) + # reward_loss.backward() + # self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py index 534ff71e..a4f17f31 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py @@ -9,15 +9,12 @@ import torch.utils from torch import optim -from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( - SimpleDynamics, +from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( + ProbabilisticDynamics, ) -from cares_reinforcement_learning.networks.world_models.simple_reward_sn import ( - SimpleReward, +from cares_reinforcement_learning.networks.world_models.simple_ns_reward import ( + Simple_NS_Reward, ) -# from cares_reinforcement_learning.networks.world_models.probability_rewards import ( -# ProbabilityReward, -# ) from cares_reinforcement_learning.util.helpers import normalize_observation_delta @@ -39,12 +36,12 @@ def __init__( hidden_size: int, lr: float = 0.001, ): - self.dyna_network = SimpleDynamics( + self.dyna_network = ProbabilisticDynamics( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, ) - self.reward_network = SimpleReward( + self.reward_network = Simple_NS_Reward( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world_sn.py 
b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py similarity index 94% rename from cares_reinforcement_learning/networks/world_models/ensemble_world_sn.py rename to cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py index f2dc2762..003c4a02 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_world_sn.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py @@ -9,11 +9,11 @@ import torch.utils from torch import optim -from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( - SimpleDynamics, +from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( + ProbabilisticDynamics, ) -from cares_reinforcement_learning.networks.world_models.simple_reward_sn import ( - SimpleReward, +from cares_reinforcement_learning.networks.world_models.simple_ns_reward import ( + Simple_NS_Reward, ) from cares_reinforcement_learning.util.helpers import normalize_observation_delta @@ -32,7 +32,7 @@ def __init__( self.observation_size = observation_size self.num_actions = num_actions - self.reward_network = SimpleReward( + self.reward_network = Simple_NS_Reward( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, @@ -40,7 +40,7 @@ def __init__( self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) self.models = [ - SimpleDynamics( + ProbabilisticDynamics( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, diff --git a/cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py b/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py similarity index 94% rename from cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py rename to cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py index fc4e1f87..4c9ee396 100644 --- a/cares_reinforcement_learning/networks/world_models/ensmeble_world_sa.py +++ b/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py @@ -9,11 +9,11 @@ import torch.utils from torch import optim -from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( - SimpleDynamics, +from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( + ProbabilisticDynamics, ) -from cares_reinforcement_learning.networks.world_models.simple_reward_sa import ( - SimpleRewardSA, +from cares_reinforcement_learning.networks.world_models.simple_sa_reward import ( + Simple_SA_Reward, ) from cares_reinforcement_learning.util.helpers import normalize_observation_delta @@ -32,7 +32,7 @@ def __init__( self.observation_size = observation_size self.num_actions = num_actions - self.reward_network = SimpleRewardSA( + self.reward_network = Simple_SA_Reward( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, @@ -40,7 +40,7 @@ def __init__( self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) self.models = [ - SimpleDynamics( + ProbabilisticDynamics( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, diff --git a/cares_reinforcement_learning/networks/world_models/simple_dynamics.py b/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py similarity index 98% rename from cares_reinforcement_learning/networks/world_models/simple_dynamics.py rename to cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py index a7adcd82..ca559f55 100644 --- 
a/cares_reinforcement_learning/networks/world_models/simple_dynamics.py +++ b/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py @@ -10,7 +10,7 @@ ) -class SimpleDynamics(nn.Module): +class ProbabilisticDynamics(nn.Module): """ A world model with fully connected layers. It takes current states (s) and current actions (a), and predict next states (s'). @@ -66,6 +66,7 @@ def forward( x = F.relu(x) normalized_mean = self.mean_layer(x) logvar = self.logvar_layer(x) + logvar = torch.tanh(logvar) normalized_var = torch.exp(logvar) # Always denormalized delta diff --git a/cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py b/cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py new file mode 100644 index 00000000..3d84f33a --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py @@ -0,0 +1,54 @@ +import torch +from torch import nn, Tensor +import torch.nn.functional as F +from cares_reinforcement_learning.util.helpers import weight_init + + +class Probabilistic_SAS_Reward(nn.Module): + def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). + + :param (int) observation_size -- dimension of states + :param (int) num_actions -- dimension of actions + :param (int) hidden_size -- size of neurons in hidden layers. + """ + super().__init__() + self.observation_size = observation_size + self.num_actions = num_actions + self.linear1 = nn.Linear(2 * observation_size + num_actions, hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + self.linear3 = nn.Linear(hidden_size, 1) + self.linear4 = nn.Linear(hidden_size, 1) + self.apply(weight_init) + + def forward( + self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. + + :param (Tensors) obs -- dimension of states + :param (Tensors) actions -- dimension of actions + :param (Bool) normalized -- whether normalized reward to 0~1 + + :return (Tensors) x -- predicted rewards. 
+ """ + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + x = torch.cat((observation, actions, next_observation), dim=1) + x = self.linear1(x) + x = F.relu(x) + x = self.linear2(x) + x = F.relu(x) + rwd_mean = self.linear3(x) + var_mean = self.linear4(x) + logvar = torch.tanh(var_mean) + normalized_var = torch.exp(logvar) + # if normalized: + # rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/simple_reward_sn.py b/cares_reinforcement_learning/networks/world_models/simple_ns_reward.py similarity index 98% rename from cares_reinforcement_learning/networks/world_models/simple_reward_sn.py rename to cares_reinforcement_learning/networks/world_models/simple_ns_reward.py index d385de12..67491a87 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_reward_sn.py +++ b/cares_reinforcement_learning/networks/world_models/simple_ns_reward.py @@ -4,7 +4,7 @@ from cares_reinforcement_learning.util.helpers import weight_init -class SimpleReward(nn.Module): +class Simple_NS_Reward(nn.Module): def __init__(self, observation_size: int, num_actions: int, hidden_size: int): """ Note, This reward function is limited to 0 ~ 1 for dm_control. diff --git a/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py b/cares_reinforcement_learning/networks/world_models/simple_sa_reward.py similarity index 98% rename from cares_reinforcement_learning/networks/world_models/simple_reward_sa.py rename to cares_reinforcement_learning/networks/world_models/simple_sa_reward.py index d94ff29c..afc4f49b 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_reward_sa.py +++ b/cares_reinforcement_learning/networks/world_models/simple_sa_reward.py @@ -4,7 +4,7 @@ from cares_reinforcement_learning.util.helpers import weight_init -class SimpleRewardSA(nn.Module): +class Simple_SA_Reward(nn.Module): def __init__(self, observation_size: int, num_actions: int, hidden_size: int): """ Note, This reward function is limited to 0 ~ 1 for dm_control. diff --git a/cares_reinforcement_learning/networks/world_models/probability_rewards.py b/cares_reinforcement_learning/networks/world_models/simple_sas_done.py similarity index 74% rename from cares_reinforcement_learning/networks/world_models/probability_rewards.py rename to cares_reinforcement_learning/networks/world_models/simple_sas_done.py index 3e53d5a3..bff8cc14 100644 --- a/cares_reinforcement_learning/networks/world_models/probability_rewards.py +++ b/cares_reinforcement_learning/networks/world_models/simple_sas_done.py @@ -4,7 +4,7 @@ from cares_reinforcement_learning.util.helpers import weight_init -class ProbabilityReward(nn.Module): +class SASDone(nn.Module): def __init__(self, observation_size: int, num_actions: int, hidden_size: int): """ Note, This reward function is limited to 0 ~ 1 for dm_control. 
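Both the probabilistic dynamics head (which gains a tanh on its log-variance in this hunk) and the new Probabilistic_SAS_Reward head squash the raw log-variance with tanh before exponentiating, which keeps the predicted variance inside roughly [0.37, 2.72] and avoids the exploding or vanishing variances a bare exp can produce. A minimal standalone illustration of that bounding, using plain tensors rather than the repository's modules:

    import torch

    def bounded_variance(raw_logvar: torch.Tensor) -> torch.Tensor:
        # tanh keeps the log-variance in (-1, 1), so the variance stays in (e^-1, e^1).
        return torch.exp(torch.tanh(raw_logvar))

    raw = torch.tensor([-100.0, 0.0, 100.0])  # extreme raw outputs from a linear layer
    print(bounded_variance(raw))              # ~tensor([0.3679, 1.0000, 2.7183])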
@@ -18,14 +18,13 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): super().__init__() self.observation_size = observation_size self.num_actions = num_actions - self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) + self.linear1 = nn.Linear(2 * observation_size + num_actions, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.mean = nn.Linear(hidden_size, 1) - self.var = nn.Linear(hidden_size, 1) + self.linear3 = nn.Linear(hidden_size, 1) self.apply(weight_init) def forward( - self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False + self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor, normalized: bool = False ) -> torch.Tensor: """ Forward the inputs throught the network. @@ -41,15 +40,11 @@ def forward( observation.shape[1] + actions.shape[1] == self.observation_size + self.num_actions ) - x = torch.cat((observation, actions), dim=1) + x = torch.cat((observation, actions, next_observation), dim=1) x = self.linear1(x) x = F.relu(x) x = self.linear2(x) x = F.relu(x) - rwd_mean = self.mean(x) - rwd_var = self.var(x) - logvar = torch.tanh(rwd_var) - rwd_var = torch.exp(logvar) - if normalized: - rwd_mean = F.sigmoid(rwd_mean) - return rwd_mean, rwd_var + x = self.linear3(x) + prob_x = F.sigmoid(x) + return prob_x diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 85841c65..8d5fdc35 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -138,6 +138,21 @@ class SACConfig(AlgorithmConfig): reward_scale: Optional[float] = 1.0 +class STEVEConfig(AlgorithmConfig): + algorithm: str = Field("STEVE", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + alpha_lr: Optional[float] = 3e-4 + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon = 3 + num_world_models: Optional[int] = 5 + num_reward_models: Optional[int] = 5 + num_critic_models: Optional[int] = 5 + + class DynaSAC_SAConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_SA", Literal=True) actor_lr: Optional[float] = 3e-4 @@ -296,6 +311,7 @@ class DynaSAC_SUNRISEReweightConfig(AlgorithmConfig): mode: Optional[int] = 1 sample_times: Optional[int] = 10 + class DynaSAC_UWACReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_UWACReweight", Literal=True) actor_lr: Optional[float] = 3e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 77ad6fa0..fef04fe2 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -77,6 +77,48 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent +def create_STEVE(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
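The repurposed SASDone head above emits a sigmoid probability of termination from (s, a, s'), but this patch does not include a training loop for it. A typical setup (an assumption, not taken from the repository) would treat recorded dones as binary labels and minimise binary cross-entropy; a stand-in module with illustrative sizes is used here:

    import torch
    import torch.nn.functional as F
    from torch import nn, optim

    obs_size, act_size, hidden = 4, 2, 32     # placeholder dimensions
    done_net = nn.Sequential(                 # stand-in for SASDone, outputting logits
        nn.Linear(2 * obs_size + act_size, hidden), nn.ReLU(),
        nn.Linear(hidden, hidden), nn.ReLU(),
        nn.Linear(hidden, 1),
    )
    optimiser = optim.Adam(done_net.parameters(), lr=1e-3)

    states = torch.randn(256, obs_size)
    actions = torch.randn(256, act_size)
    next_states = torch.randn(256, obs_size)
    dones = torch.randint(0, 2, (256, 1)).float()

    logits = done_net(torch.cat((states, actions, next_states), dim=1))
    # Same objective as BCE on sigmoid(logits), but numerically more stable.
    loss = F.binary_cross_entropy_with_logits(logits, dones)
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()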
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import STEVE + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble_all import EnsembleWorldRewardDone + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldRewardDone( + observation_size=observation_size, + num_actions=action_num, + num_world_models=config.num_world_models, + num_reward_models=config.num_reward_models, + num_done_models=config.num_done_models, + lr=config.world_model_lr, + device=device, + ) + + agent = STEVE( + actor_network=actor, + critic_network=critic, + world_network=world_model, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + L=config.num_critic_models, + device=device, + ) + return agent + + def create_DynaSAC_SA(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. @@ -85,7 +127,7 @@ def create_DynaSAC_SA(observation_size, action_num, config: AlgorithmConfig): """ from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SA from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import EnsembleWorldAndOneSAReward + from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import EnsembleWorldAndOneSAReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) @@ -125,7 +167,7 @@ def create_DynaSAC_SABR(observation_size, action_num, config: AlgorithmConfig): """ from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SABR from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensmeble_world_sa import EnsembleWorldAndOneSAReward + from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import EnsembleWorldAndOneSAReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) @@ -294,6 +336,7 @@ def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmCo ) return agent + def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. 
@@ -424,7 +467,6 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): return agent - def create_SAC(observation_size, action_num, config: AlgorithmConfig): from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic From 5e41d3f7ddd5f1c8b09e508f788c67febd82c79d Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 7 Aug 2024 10:26:03 +1200 Subject: [PATCH 53/91] steve --- .../mbrl/{STEVESAC.py => STEVE_MEAN_SAC.py} | 197 ++++++------------ .../algorithm/mbrl/__init__.py | 2 +- .../memory/memory_buffer.py | 0 .../networks/world_models/ensemble_all.py | 70 ++++++- .../util/configurations.py | 12 +- .../util/network_factory.py | 7 +- 6 files changed, 142 insertions(+), 146 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{STEVESAC.py => STEVE_MEAN_SAC.py} (50%) delete mode 100644 cares_reinforcement_learning/memory/memory_buffer.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py similarity index 50% rename from cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py rename to cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py index 926e15ab..12864a53 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py @@ -11,6 +11,7 @@ import os import numpy as np import torch +import torch.nn.functional as F from cares_reinforcement_learning.memory import PrioritizedReplayBuffer @@ -19,7 +20,7 @@ ) -class STEVE: +class STEVE_MEAN: def __init__( self, actor_network: torch.nn.Module, @@ -98,111 +99,54 @@ def _train_policy( ) -> None: ################## Update the Critic First #################### with torch.no_grad(): - # cumulative_rewards = rewards - # pred_s = next_states - # tree_mask = dones.squeeze().bool() - # q_means = [] - # q_vars = [] - - # for hori in range(self.horizon): - # # As normal - # pred_a, _, _ = self.actor_net(pred_s) - # # Pred the future - # pred_s, pred_r, pred_done = self.env.tensor_query(pred_s, pred_a) - # pred_s = pred_s.to(self.device) - # pred_r = pred_r.to(self.device) - # pred_done = pred_done.bool().to(self.device) - # # Before adding pred to mask - # pred_r[tree_mask, :] = 0.0 - # cumulative_rewards += pred_r * (self.gamma ** (hori + 1)) - # # Kill the branch with the previous - # tree_mask = torch.logical_or(tree_mask, pred_done.squeeze()) - # q_target = cumulative_rewards - - # Expand the value estimation here to STEVE. - # Maintain a list of Q values for each horizon with the same size! - # Q = r + yr + y^2 * (q - pi) - # Propagate uncertainty with sampling. 
[256, 17] -> [10, 256, 17] -> [100, 256, 17] -> [1000, 256, 17] - # Q : [256, 1], [256, 1], [256, 1] - - # not_dones = (1 - dones).squeeze().bool() - # pred_all_next_obs = next_states.unsqueeze(dim=0) - # pred_all_next_rewards = torch.zeros(rewards.shape).unsqueeze(dim=0) - # - # q_means = [] - # q_vars = [] - # - # for hori in range(self.horizon): - # horizon_rewards_list = [] - # horizon_obs_list = [] - # horizon_q_list = [] - # - # for stat in range(pred_all_next_obs.shape[0]): - # # Optimal sampling - # pred_action, pred_log_pi, _ = self.actor_net.sample(pred_all_next_obs[stat]) - # - # pred_q1, pred_q2 = self.target_critic_net(pred_all_next_obs[stat], pred_action) - # # V = Q - alpha * logi - # pred_v1 = pred_q1 - self._alpha * pred_log_pi - # pred_v2 = pred_q2 - self._alpha * pred_log_pi - # - # # Predict a set of reward first - # _, pred_rewards = self.world_model.pred_rewards(observation=pred_all_next_obs[stat], - # action=pred_action) - # - # temp_disc_rewards = [] - # # For each predict reward. - # for rwd in range(pred_rewards.shape[0]): - # disc_pred_reward = not_dones * (self.gamma ** (hori + 1)) * pred_rewards[rwd] - # if hori > 0: - # # Horizon = 1, 2, 3, 4, 5 - # disc_sum_reward = pred_all_next_rewards[stat] + disc_pred_reward - # else: - # disc_sum_reward = not_dones * disc_pred_reward - # temp_disc_rewards.append(disc_sum_reward) - # assert rewards.shape == not_dones.shape == disc_sum_reward.shape - # # Q = r + disc_rewards + pred_v - # pred_tq1 = rewards + disc_sum_reward + not_dones * (self.gamma ** (hori + 2)) * pred_v1 - # pred_tq2 = rewards + disc_sum_reward + not_dones * (self.gamma ** (hori + 2)) * pred_v2 - # horizon_q_list.append(pred_tq1) - # horizon_q_list.append(pred_tq2) - # - # # Observation Level - # if hori < (self.horizon - 1): - # _, pred_obs, _, _ = self.world_model.pred_next_states(pred_all_next_obs[stat], pred_action) - # - # horizon_obs_list.append(pred_obs) - # horizon_rewards_list.append(torch.stack(temp_disc_rewards)) - # - # # Horizon level. 
- # if hori < (self.horizon - 1): - # pred_all_next_obs = torch.vstack(horizon_obs_list) - # pred_all_next_rewards = torch.vstack(horizon_rewards_list) - # - # # Statistics of target q - # h_0 = torch.stack(horizon_q_list) - # mean_0 = torch.mean(h_0, dim=0) - # q_means.append(mean_0) - # var_0 = torch.var(h_0, dim=0) - # var_0[torch.abs(var_0) < 0.001] = 0.001 - # var_0 = 1.0 / var_0 - # q_vars.append(var_0) - # all_means = torch.stack(q_means) - # all_vars = torch.stack(q_vars) - # total_vars = torch.sum(all_vars, dim=0) - # for n in range(self.horizon): - # all_vars[n] /= total_vars - # q_target = torch.sum(all_vars * all_means, dim=0) - - next_actions, next_log_pi, _ = self.actor_net(next_states) - - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values + not_dones = (1 - dones) + q_means = [] + q_weights = [] + accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=25, dim=0) + # 5 * 5 * 4 = 100 + for hori in range(self.horizon): + curr_hori_action, curr_hori_log_pi, _ = self.actor_net(next_states) + mean_predictions, all_mean_next, _, _ = self.world_model.pred_next_states(next_states, curr_hori_action) + pred_rewards, _ = self.world_model.pred_multiple_rewards(observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next) + pred_rewards *= (self.gamma ** (hori + 1)) + accum_dist_rewards += pred_rewards + + # V = Q - alpha * logi + pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) + pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) + pred_v1 = pred_q1 - self._alpha * curr_hori_log_pi + pred_v2 = pred_q2 - self._alpha * curr_hori_log_pi + pred_v3 = pred_q3 - self._alpha * curr_hori_log_pi + pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi + q_0 = [] + for i in range(pred_rewards.shape[0]): + pred_tq1 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + pred_tq2 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + pred_tq3 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + pred_tq4 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + q_0.append(pred_tq1) + q_0.append(pred_tq2) + q_0.append(pred_tq3) + q_0.append(pred_tq4) + q_0 = torch.stack(q_0) + # Compute var, mean and add them to the queue + # [100, 256, 1] -> [256, 1] + mean_0 = torch.mean(q_0, dim=0) + q_means.append(mean_0) + var_0 = torch.var(q_0, dim=0) + var_0[torch.abs(var_0) < 0.0001] = 0.0001 + weights_0 = 1.0 / var_0 + q_weights.append(weights_0) + + next_states = mean_predictions + all_means = torch.stack(q_means) + all_weights = torch.stack(q_weights) + total_weights = torch.sum(all_weights, dim=0) + for n in range(self.horizon): + all_weights[n] /= total_weights + q_target = torch.sum(all_weights * all_means, dim=0) q_values_one, q_values_two = self.critic_net(states, actions) critic_loss_one = ((q_values_one - q_target).pow(2)).mean() @@ -253,9 +197,6 @@ def train_world_model( rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - # Brief Evaluate the world model and reward prediciton. 
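The replacement target above is a STEVE-style combination: every horizon contributes a set of candidate target-Q values (reward samples from the reward ensemble combined with bootstrapped values from both the target and online critics), each horizon is reduced to a mean and an inverse-variance weight, the weights are normalised across horizons, and the final q_target is the weighted sum. A self-contained toy version of just that combination step, with random tensors standing in for the model rollouts:

    import torch

    horizon, candidates, batch = 3, 100, 256
    # Candidate target-Q values per horizon: [H, K, B, 1]
    q_candidates = torch.randn(horizon, candidates, batch, 1)

    q_means = q_candidates.mean(dim=1)                    # [H, B, 1]
    q_vars = q_candidates.var(dim=1).clamp(min=1e-4)      # guard against ~zero variance
    weights = 1.0 / q_vars                                # precision of each horizon
    weights = weights / weights.sum(dim=0, keepdim=True)  # normalise over horizons

    q_target = (weights * q_means).sum(dim=0)             # [B, 1]
    assert q_target.shape == (batch, 1)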
- next_s, _, _, _ = self.world_model.pred_next_states(states, actions) - self.world_model.train_world( states=states, actions=actions, @@ -265,30 +206,30 @@ def train_world_model( states=states, actions=actions, rewards=rewards, + next_states=next_states ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: self.learn_counter += 1 - # experiences = memory.sample_uniform(batch_size) - # states, actions, rewards, next_states, dones, _ = experiences - # - # # Convert into tensor - # states = torch.FloatTensor(np.asarray(states)).to(self.device) - # actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - # rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - # next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - # dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - # - # # Step 2 train as usual - # self._train_policy( - # states=states, - # actions=actions, - # rewards=rewards, - # next_states=next_states, - # dones=dones, - # ) + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + ) def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index bf17ec5a..ac339772 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -6,4 +6,4 @@ from .DynaSAC_BIV_Reweight import DynaSAC_BIVReweight from .DynaSAC_SA import DynaSAC_SA from .DynaSAC_SA_Immerse_Reweight import DynaSAC_SABR -from .STEVESAC import STEVE \ No newline at end of file +from .STEVE_MEAN_SAC import STEVE_MEAN \ No newline at end of file diff --git a/cares_reinforcement_learning/memory/memory_buffer.py b/cares_reinforcement_learning/memory/memory_buffer.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_all.py b/cares_reinforcement_learning/networks/world_models/ensemble_all.py index 53219d33..62b894a4 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_all.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_all.py @@ -33,12 +33,10 @@ def __init__( num_actions: int, num_world_models: int, num_reward_models: int, - num_done_models: int, lr: float, device: str, hidden_size: int = 128, ): - self.num_done_models = num_done_models self.num_reward_models = num_reward_models self.num_world_models = num_world_models @@ -51,7 +49,7 @@ def __init__( self.reward_models = [Probabilistic_SAS_Reward(observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size) for _ in range(self.num_reward_models)] self.world_optimizers = [optim.Adam(self.world_models[i].parameters(), lr=lr) for i in - range(self.num_done_models)] + range(self.num_world_models)] self.reward_optimizers = 
[optim.Adam(self.reward_models[i].parameters(), lr=lr) for i in range(self.num_reward_models)] @@ -81,23 +79,59 @@ def set_statistics(self, statistics: dict) -> None: for model in self.world_models: model.statistics = statistics - def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor): + def pred_multiple_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor): """ predict reward based on current observation and action and next state """ - pred_rewards = [] + assert len(next_observation.shape) == 3 + pred_reward_means = [] + pred_reward_vars = [] + # 5 + for j in range(next_observation.shape[0]): + next_obs = next_observation[j] + # 5 + for i in range(self.num_reward_models): + pred_reward, reward_var = self.reward_models[i].forward(observation, action, next_obs) + pred_reward_means.append(pred_reward) + pred_reward_vars.append(reward_var) + pred_reward_means = torch.stack(pred_reward_means) + pred_reward_vars = torch.stack(pred_reward_vars) + return pred_reward_means, pred_reward_vars + + def pred_rewards(self, observation: torch.Tensor, + action: torch.Tensor, next_observation: torch.Tensor): + """ + predict reward based on current observation and action and next state + """ + pred_reward_means = [] pred_reward_vars = [] for i in range(self.num_reward_models): pred_reward, reward_var = self.reward_models[i].forward(observation, action, next_observation) - pred_rewards.append(pred_reward) + pred_reward_means.append(pred_reward) pred_reward_vars.append(reward_var) - pred_rewards = torch.vstack(pred_rewards) - pred_reward_vars = torch.vstack(pred_reward_vars) - return pred_rewards, pred_reward_vars + pred_reward_means = torch.stack(pred_reward_means) + pred_reward_vars = torch.stack(pred_reward_vars) + pred_rewards = torch.mean(pred_reward_means, dim=0) + + return pred_rewards, pred_reward_means, pred_reward_vars def pred_next_states( self, observation: torch.Tensor, actions: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict the next state based on the current state and action. + + The output is + Args: + observation: + actions: + + Returns: + prediction: Single prediction, probably mean. + all_predictions: all means from different model. + predictions_norm_means: normalized means. + predictions_vars: normalized vars. + """ assert ( observation.shape[1] + actions.shape[1] == self.observation_size + self.num_actions @@ -132,6 +166,7 @@ def pred_next_states( all_predictions = torch.stack(means) for j in range(all_predictions.shape[0]): all_predictions[j] += observation + return prediction, all_predictions, predictions_norm_means, predictions_vars def train_world( @@ -140,6 +175,14 @@ def train_world( actions: torch.Tensor, next_states: torch.Tensor, ) -> None: + """ + Train the world with S, A, SN. Different sub-batch. + + Args: + states: + actions: + next_states: + """ assert len(states.shape) >= 2 assert len(actions.shape) == 2 assert ( @@ -168,6 +211,15 @@ def train_reward( next_states: torch.Tensor, rewards: torch.Tensor, ) -> None: + """ + Train the reward with S, A, SN to eliminate difference between them. 
+ + Args: + states: + actions: + next_states: + rewards: + """ mini_batch_size = int(math.floor(states.shape[0] / self.num_reward_models)) for i in range(self.num_reward_models): sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 8d5fdc35..863a6e61 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -138,19 +138,23 @@ class SACConfig(AlgorithmConfig): reward_scale: Optional[float] = 1.0 -class STEVEConfig(AlgorithmConfig): - algorithm: str = Field("STEVE", Literal=True) +class STEVE_MEANConfig(AlgorithmConfig): + algorithm: str = Field("STEVE_MEAN", Literal=True) + # SAC Parameters. actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 alpha_lr: Optional[float] = 3e-4 gamma: Optional[float] = 0.99 tau: Optional[float] = 0.005 reward_scale: Optional[float] = 1.0 - + # World Model Parameters + world_model_lr: Optional[float] = 0.001 + num_samples: Optional[int] = 10 + # STEVE Parameters. horizon = 3 num_world_models: Optional[int] = 5 num_reward_models: Optional[int] = 5 - num_critic_models: Optional[int] = 5 + num_critic_models: Optional[int] = 4 class DynaSAC_SAConfig(AlgorithmConfig): diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index fef04fe2..451b8f2c 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -77,13 +77,13 @@ def create_PPO(observation_size, action_num, config: AlgorithmConfig): return agent -def create_STEVE(observation_size, action_num, config: AlgorithmConfig): +def create_STEVE_MEAN(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import STEVE + from cares_reinforcement_learning.algorithm.mbrl import STEVE_MEAN from cares_reinforcement_learning.networks.SAC import Actor, Critic from cares_reinforcement_learning.networks.world_models.ensemble_all import EnsembleWorldRewardDone @@ -97,12 +97,11 @@ def create_STEVE(observation_size, action_num, config: AlgorithmConfig): num_actions=action_num, num_world_models=config.num_world_models, num_reward_models=config.num_reward_models, - num_done_models=config.num_done_models, lr=config.world_model_lr, device=device, ) - agent = STEVE( + agent = STEVE_MEAN( actor_network=actor, critic_network=critic, world_network=world_model, From 158c6cf49e632b64654dfc148830d73ed62f1d8d Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 7 Aug 2024 10:38:43 +1200 Subject: [PATCH 54/91] to device --- .../networks/world_models/ensemble_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_all.py b/cares_reinforcement_learning/networks/world_models/ensemble_all.py index 62b894a4..9e3cf290 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_all.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_all.py @@ -54,7 +54,7 @@ def __init__( range(self.num_reward_models)] # Bring all reward prediction and dynamic rediction networks to device. 
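pred_multiple_rewards above evaluates every reward head on every world-model mean prediction, which is where STEVE_MEAN's 5 x 5 = 25 reward estimates per transition (and its repeats=25) come from. A pure-tensor sketch of the resulting shapes, with stub linear heads standing in for the Probabilistic_SAS_Reward networks and placeholder sizes:

    import torch
    from torch import nn

    obs_size, act_size, batch = 4, 2, 256
    num_world_models, num_reward_models = 5, 5

    reward_heads = [nn.Linear(2 * obs_size + act_size, 1) for _ in range(num_reward_models)]

    observation = torch.randn(batch, obs_size)
    action = torch.randn(batch, act_size)
    next_observation = torch.randn(num_world_models, batch, obs_size)  # one mean per world model

    preds = []
    for next_obs in next_observation:          # outer loop: world models
        for head in reward_heads:              # inner loop: reward heads
            preds.append(head(torch.cat((observation, action, next_obs), dim=1)))
    pred_reward_means = torch.stack(preds)     # [M * R, B, 1] == [25, 256, 1]
    assert pred_reward_means.shape == (num_world_models * num_reward_models, batch, 1)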
- for reward_model in self.world_models: + for reward_model in self.reward_models: reward_model.to(self.device) for world_model in self.world_models: world_model.to(self.device) From 777745ea5cb70878283d996672c3eed8ae08f874 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 11:21:41 +1200 Subject: [PATCH 55/91] naming convention --- .../algorithm/mbrl/DynaSAC_BIV_Reweight.py | 4 +- .../mbrl/{DynaSAC.py => DynaSAC_NS.py} | 44 ++- ...ight.py => DynaSAC_NS_Immersive_Weight.py} | 11 +- .../algorithm/mbrl/DynaSAC_SAS.py | 251 ++++++++++++++++++ ...mbo.py => DynaSAC_SAS_Immersive_Weight.py} | 210 ++++++++------- ...ight.py => DynaSAC_SA_Immersive_Weight.py} | 7 +- .../mbrl/DynaSAC_SUNRISE_Reweight.py | 4 +- .../algorithm/mbrl/DynaSAC_UWAC_Reweight.py | 4 +- .../algorithm/mbrl/STEVE_MEAN_SAC.py | 9 +- .../algorithm/mbrl/__init__.py | 15 +- .../networks/world_models/__init__.py | 16 +- .../world_models/ensemble_ns_world.py | 6 +- .../world_models/ensemble_sas_world.py | 168 ++++++++++++ ... => ensemble_world_ensemble_sas_reward.py} | 20 +- .../world_models/ensmeble_sa_world.py | 8 +- .../world_models/probabilistic_dynamics.py | 2 +- .../networks/world_models/simple_sas_done.py | 2 +- .../world_models/simple_sas_reward.py | 51 ++++ ...integrated.py => z_ensemble_integrated.py} | 4 +- .../util/configurations.py | 32 ++- .../util/network_factory.py | 91 +++++-- 21 files changed, 773 insertions(+), 186 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC.py => DynaSAC_NS.py} (79%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_Immerse_Reweight.py => DynaSAC_NS_Immersive_Weight.py} (98%) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_Immerse_Reweight_Combo.py => DynaSAC_SAS_Immersive_Weight.py} (70%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_SA_Immerse_Reweight.py => DynaSAC_SA_Immersive_Weight.py} (98%) create mode 100644 cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py rename cares_reinforcement_learning/networks/world_models/{ensemble_all.py => ensemble_world_ensemble_sas_reward.py} (94%) create mode 100644 cares_reinforcement_learning/networks/world_models/simple_sas_reward.py rename cares_reinforcement_learning/networks/world_models/{ensemble_integrated.py => z_ensemble_integrated.py} (99%) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py index 36f6902f..070e1efb 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py @@ -16,7 +16,7 @@ import torch.nn.functional as F from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneReward, + EnsembleWorldAndOneNSReward, ) @@ -29,7 +29,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, + world_network: EnsembleWorldAndOneNSReward, gamma: float, tau: float, action_num: int, diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py similarity index 79% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py index e10eadf1..71cd2f53 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ 
b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py @@ -16,17 +16,17 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( - EnsembleWorldAndOneSAReward, +from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( + EnsembleWorldAndOneNSReward, ) -class DynaSAC_SA: +class DynaSAC_NS: def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneSAReward, + world_network: EnsembleWorldAndOneNSReward, gamma: float, tau: float, action_num: int, @@ -171,8 +171,7 @@ def train_world_model( next_states=next_states, ) self.world_model.train_reward( - states=states, - actions=actions, + next_states=next_states, rewards=rewards, ) @@ -198,6 +197,39 @@ def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None dones=dones, ) + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + ) + def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py similarity index 98% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py index 5ba056f6..0724e904 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py @@ -16,10 +16,11 @@ import torch.nn.functional as F from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneReward, + EnsembleWorldAndOneNSReward, ) + class DynaSAC_ScaleBatchReweight: """ Max as ? 
@@ -29,7 +30,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, + world_network: EnsembleWorldAndOneNSReward, gamma: float, tau: float, action_num: int, @@ -394,12 +395,6 @@ def sampling(self, pred_means, pred_vars): threshold = self.threshold_scale * (max_var - min_var) + min_var total_var[total_var <= threshold] = threshold - # Exacerbate the sample difference. - # mean_var = torch.mean(total_var) - # ratio = mean_var / ((1.0 / total_var.shape[0]) * (torch.prod(total_var))) - # normalize vars to sum = 1 - # total_var *= () - total_var += 0.00000001 total_stds = 1 / total_var diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py new file mode 100644 index 00000000..2f2ad91a --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py @@ -0,0 +1,251 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F + +from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble_sas_world import ( + EnsembleWorldAndOneSASReward, +) + + +class DynaSAC_SAS: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: EnsembleWorldAndOneSASReward, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + ) -> None: + + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: PrioritizedReplayBuffer, batch_size: int + ) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + states=states, + actions=actions, + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: 
int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + pred_reward = self.world_model.pred_rewards(pred_state, pred_acts, pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! 
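The _dyna_generate_and_train methods introduced in this commit branch each real next state into num_samples copies per step, act on every copy with uniformly random actions (the inline comment notes this is controversial but empirically better than on-policy actions), and stack the imagined transitions into one large synthetic batch. A toy version of that rollout loop, with a stub dynamics-and-reward function in place of the world model:

    import numpy as np
    import torch

    obs_size, action_num, horizon, num_samples = 4, 2, 3, 10

    def fake_model(states, actions):
        # Stand-in for world_model.pred_next_states / pred_rewards.
        return states + 0.1 * torch.randn_like(states), torch.rand(states.shape[0], 1)

    next_states = torch.randn(8, obs_size)   # real next states sampled from the buffer
    pred_states, pred_actions, pred_rs, pred_n_states = [], [], [], []

    with torch.no_grad():
        pred_state = next_states
        for _ in range(horizon):
            pred_state = torch.repeat_interleave(pred_state, num_samples, dim=0)
            rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], action_num))
            pred_acts = torch.FloatTensor(rand_acts)
            pred_next_state, pred_reward = fake_model(pred_state, pred_acts)
            pred_states.append(pred_state)
            pred_actions.append(pred_acts)
            pred_rs.append(pred_reward)
            pred_n_states.append(pred_next_state)
            pred_state = pred_next_state

    # One synthetic batch of 8 * (10 + 100 + 1000) imagined transitions.
    batch = [torch.vstack(t) for t in (pred_states, pred_actions, pred_rs, pred_n_states)]
    print([t.shape for t in batch])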
+ pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + ) + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py similarity index 70% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index c72a9832..162eb3b5 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Immerse_Reweight_Combo.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -15,12 +15,12 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneReward, +from cares_reinforcement_learning.networks.world_models.ensemble_sas_world import ( + EnsembleWorldAndOneSASReward, ) -class DynaSAC_Immerse_Reweight_Combo: +class DynaSAC_SAS_Immersive_Weight: """ Max as ? """ @@ -29,7 +29,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, + world_network: EnsembleWorldAndOneSASReward, gamma: float, tau: float, action_num: int, @@ -38,16 +38,17 @@ def __init__( alpha_lr: float, num_samples: int, horizon: int, - threshold_scale_actor: float, - threshold_scale_critic: float, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, sample_times: int, device: torch.device, ): self.type = "mbrl" self.device = device - self.threshold_scale_actor = threshold_scale_actor - self.threshold_scale_critic = threshold_scale_critic - + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations @@ -80,6 +81,8 @@ def __init__( # World model self.world_model = world_network # Parameter + self.threshold_scale = threshold_scale + self.mode = mode self.sample_times = sample_times @property @@ -109,8 +112,7 @@ def _train_policy( rewards: torch.Tensor, next_states: torch.Tensor, dones: torch.Tensor, - critic_weights: torch.Tensor, - actor_weights: torch.Tensor, + weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target values? 
@@ -126,20 +128,29 @@ def _train_policy( q_values_one, q_values_two = self.critic_net(states, actions) - # Reweighted loss function. weight not participant in training. - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - critic_weights = critic_weights.detach() - disc_l2_loss_one = l2_loss_one * critic_weights - disc_l2_loss_two = l2_loss_two * critic_weights - # A ratio to scale the loss back to original loss scale. - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - critic_loss_total = critic_loss_one + critic_loss_two + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. + + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two # Update the Critic self.critic_net_optimiser.zero_grad() @@ -151,12 +162,15 @@ def _train_policy( qf1_pi, qf2_pi = self.critic_net(states, pi) min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_weights = actor_weights.detach() - a_loss = (self._alpha * first_log_p) - min_qf_pi - disc_actor_loss = a_loss * actor_weights - actor_ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) - actor_ratio = actor_ratio.detach() - actor_loss = actor_ratio * torch.mean(disc_actor_loss) + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() # Update the Actor self.actor_net_optimiser.zero_grad() @@ -197,6 +211,8 @@ def train_world_model( next_states=next_states, ) self.world_model.train_reward( + states=states, + actions=actions, next_states=next_states, rewards=rewards, ) @@ -221,8 +237,7 @@ def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None rewards=rewards, next_states=next_states, dones=dones, - critic_weights=full_weights, - actor_weights=full_weights, + weights=full_weights, ) # # # Step 3 Dyna add more data self._dyna_generate_and_train(next_states=next_states) @@ -236,9 +251,7 @@ def _dyna_generate_and_train(self, next_states): pred_actions = [] pred_rs = [] pred_n_states = [] - pred_uncerts_actor = [] - pred_uncerts_critic = [] - + pred_uncerts = [] with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): @@ -250,13 +263,11 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( 
pred_state, pred_acts ) - critic_uncert, actor_uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) - critic_uncert = critic_uncert.unsqueeze(dim=1).to(self.device) - actor_uncert = actor_uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts_critic.append(critic_uncert) - pred_uncerts_actor.append(actor_uncert) + uncert = self.sampling(pred_state, pred_act, pred_means=pred_mean, pred_vars=pred_var) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) - pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_reward = self.world_model.pred_rewards(pred_state, pred_acts, pred_next_state) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) @@ -266,22 +277,25 @@ def _dyna_generate_and_train(self, next_states): pred_actions = torch.vstack(pred_actions) pred_rs = torch.vstack(pred_rs) pred_n_states = torch.vstack(pred_n_states) - pred_uncerts_actor = torch.vstack(pred_uncerts_actor) - pred_uncerts_critic = torch.vstack(pred_uncerts_critic) + pred_weights = torch.vstack(pred_uncerts) # Pay attention to here! It is dones in the Cares RL Code! pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_uncerts_critic, pred_uncerts_actor + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars): + def sampling(self, pred_state, pred_act, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 :param pred_means: :param pred_vars: :return: + + Args: + pred_act: + pred_state: """ with torch.no_grad(): # 5 models. Each predict 10 next_states. @@ -300,17 +314,18 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) + if self.reweight_critic == 1: + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, sample1[i]) + pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, sample2[i]) + pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, sample3[i]) + pred_rwd4 = self.world_model.pred_rewards(pred_state, pred_act, sample4[i]) + pred_rwd5 = self.world_model.pred_rewards(pred_state, pred_act, sample5[i]) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) # Each times, 5 models predict different actions. 
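The reweighted critic branch added earlier in this file multiplies the per-sample squared error by an uncertainty weight and then rescales by the detached ratio of unweighted to weighted mean loss, so the weights change the relative emphasis of samples without shrinking or inflating the overall loss magnitude. A standalone sketch of that pattern with random tensors:

    import torch

    q_values = torch.randn(256, 1, requires_grad=True)
    q_target = torch.randn(256, 1)
    weights = torch.rand(256, 1)                 # per-sample uncertainty weights

    l2_loss = (q_values - q_target).pow(2)
    weighted = l2_loss * weights.detach()
    # The detached ratio restores the original loss scale; gradients still follow the weights.
    ratio = (l2_loss.mean() / weighted.mean()).detach()
    critic_loss = ratio * weighted.mean()
    critic_loss.backward()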
# [2560, 17] pred_act1, log_pi1, _ = self.actor_net(sample1[i]) @@ -342,52 +357,53 @@ def sampling(self, pred_means, pred_vars): qs.append(qc) qs.append(qd) qs.append(qe) - - rs = torch.stack(rs) + if self.reweight_critic == 1: + rs = torch.stack(rs) acts = torch.stack(acts) qs = torch.stack(qs) - var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - - cov_ra = torch.mean(diff_r * diff_a, dim=0) - - gamma_sq = self.gamma * self.gamma - - critic_total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - - # For actor: alpha^2 * var_a + var_q - actor_total_var = (self._alpha ** 2) * var_a + var_q + (self._alpha ** 2) * cov_aq - - critic_min_var = torch.min(critic_total_var) - critic_max_var = torch.max(critic_total_var) + if self.reweight_critic: + var_r = torch.var(rs, dim=0) + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + + if self.reweight_actor: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + # For actor: alpha^2 * var_a + var_q + total_var = (self._alpha ** 2) * var_a + var_q + (self._alpha ** 2) * cov_aq + + min_var = torch.min(total_var) + max_var = torch.max(total_var) # As (max-min) decrease, threshold should go down. 
- critic_threshold = self.threshold_scale_critic * (critic_max_var - critic_min_var) + critic_min_var - critic_total_var[critic_total_var <= critic_threshold] = critic_threshold - - actor_min_var = torch.min(actor_total_var) - actor_max_var = torch.max(actor_total_var) - actor_threshold = self.threshold_scale_actor * (actor_max_var - actor_min_var) + actor_min_var - actor_total_var[actor_total_var <= actor_threshold] = actor_threshold + threshold = self.threshold_scale * (max_var - min_var) + min_var + total_var[total_var <= threshold] = threshold - actor_total_var += 0.00000001 - critic_total_var += 0.00000001 - critic_total_stds = 1 / critic_total_var - actor_total_stds = 1 / actor_total_var + total_var += 0.00000001 + total_stds = 1 / total_var - return critic_total_stds.detach(), actor_total_stds.detach() + return total_stds.detach() def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py similarity index 98% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py index 3cc8804d..a3e9f965 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immerse_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py @@ -358,18 +358,13 @@ def sampling(self, pred_means, pred_vars): # For actor: alpha^2 * var_a + var_q total_var = (self._alpha ** 2) * var_a + var_q + cov_aq - # Exacerbate the sample difference. - old_mean_var = torch.mean(total_var) - # normalize vars to sum = 1 - total_var /= old_mean_var - total_var += 0.00000001 min_var = torch.min(total_var) max_var = torch.max(total_var) # As (max-min) decrease, threshold should go down. 
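All of these reweighting variants convert ensemble disagreement into weights the same way: the combined variance is clamped from below at a point threshold_scale of the way between the batch minimum and maximum, a small epsilon is added, and the weight is the reciprocal, so only genuinely uncertain imagined transitions are down-weighted. A compact standalone version of that conversion (the threshold_scale value here is illustrative):

    import torch

    def variance_to_weights(total_var: torch.Tensor, threshold_scale: float = 0.1) -> torch.Tensor:
        # Clamp small variances up to a threshold between the batch min and max so
        # near-certain samples cannot receive arbitrarily large weights.
        min_var, max_var = total_var.min(), total_var.max()
        threshold = threshold_scale * (max_var - min_var) + min_var
        total_var = torch.where(total_var <= threshold, threshold, total_var)
        return (1.0 / (total_var + 1e-8)).detach()

    weights = variance_to_weights(torch.rand(256))
    print(weights.min().item(), weights.max().item())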
threshold = self.threshold_scale * (max_var - min_var) + min_var total_var[total_var <= threshold] = threshold + total_var += 0.00000001 total_stds = 1 / total_var - return total_stds.detach() def set_statistics(self, stats: dict) -> None: diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py index 84c1ca0d..c5020491 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py @@ -16,7 +16,7 @@ import torch.nn.functional as F from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneReward, + EnsembleWorldAndOneNSReward, ) @@ -29,7 +29,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, + world_network: EnsembleWorldAndOneNSReward, gamma: float, tau: float, action_num: int, diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py index ad4abfbe..dfe04df4 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py @@ -16,7 +16,7 @@ import torch.nn.functional as F from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneReward, + EnsembleWorldAndOneNSReward, ) @@ -29,7 +29,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneReward, + world_network: EnsembleWorldAndOneNSReward, gamma: float, tau: float, action_num: int, diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py index 12864a53..2fb55b41 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py @@ -15,8 +15,8 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_all import ( - EnsembleWorldRewardDone, +from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import ( + EnsembleWorldEnsembleSASReward, ) @@ -25,7 +25,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldRewardDone, + world_network: EnsembleWorldEnsembleSASReward, gamma: float, tau: float, action_num: int, @@ -105,7 +105,7 @@ def _train_policy( accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=25, dim=0) # 5 * 5 * 4 = 100 for hori in range(self.horizon): - curr_hori_action, curr_hori_log_pi, _ = self.actor_net(next_states) + _, curr_hori_log_pi, curr_hori_action= self.actor_net(next_states) mean_predictions, all_mean_next, _, _ = self.world_model.pred_next_states(next_states, curr_hori_action) pred_rewards, _ = self.world_model.pred_multiple_rewards(observation=next_states, action=curr_hori_action, @@ -139,7 +139,6 @@ def _train_policy( var_0[torch.abs(var_0) < 0.0001] = 0.0001 weights_0 = 1.0 / var_0 q_weights.append(weights_0) - next_states = mean_predictions all_means = torch.stack(q_means) all_weights = torch.stack(q_weights) diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py 
index ac339772..3ff88b02 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,9 +1,12 @@ -from .DynaSAC import DynaSAC_SA -from .DynaSAC_Immerse_Reweight import DynaSAC_ScaleBatchReweight -from .DynaSAC_Immerse_Reweight_Combo import DynaSAC_Immerse_Reweight_Combo +from .DynaSAC_NS import DynaSAC_NS +from .DynaSAC_NS_Immersive_Weight import DynaSAC_ScaleBatchReweight +from .DynaSAC_SA import DynaSAC_SA +from .DynaSAC_SA_Immersive_Weight import DynaSAC_SABR +from .DynaSAC_SAS import DynaSAC_SAS +from .DynaSAC_SAS_Immersive_Weight import DynaSAC_SAS_Immersive_Weight + from .DynaSAC_SUNRISE_Reweight import DynaSAC_SUNRISEReweight from .DynaSAC_UWAC_Reweight import DynaSAC_UWACReweight from .DynaSAC_BIV_Reweight import DynaSAC_BIVReweight -from .DynaSAC_SA import DynaSAC_SA -from .DynaSAC_SA_Immerse_Reweight import DynaSAC_SABR -from .STEVE_MEAN_SAC import STEVE_MEAN \ No newline at end of file + +from .STEVE_MEAN_SAC import STEVE_MEAN diff --git a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 211923e2..8bda984e 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,9 +1,19 @@ -from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( +from cares_reinforcement_learning.networks.world_models.z_ensemble_integrated import ( EnsembleWorldReward, ) + from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneReward, + EnsembleWorldAndOneNSReward, ) + from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( EnsembleWorldAndOneSAReward, -) \ No newline at end of file +) + +from cares_reinforcement_learning.networks.world_models.ensemble_sas_world import ( + EnsembleWorldAndOneSASReward, +) + +from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import ( + EnsembleWorldEnsembleSASReward, +) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py index 003c4a02..e2221cbe 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py @@ -10,7 +10,7 @@ from torch import optim from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - ProbabilisticDynamics, + Probabilistic_Dynamics, ) from cares_reinforcement_learning.networks.world_models.simple_ns_reward import ( Simple_NS_Reward, @@ -18,7 +18,7 @@ from cares_reinforcement_learning.util.helpers import normalize_observation_delta -class EnsembleWorldAndOneReward: +class EnsembleWorldAndOneNSReward: def __init__( self, observation_size: int, @@ -40,7 +40,7 @@ def __init__( self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) self.models = [ - ProbabilisticDynamics( + Probabilistic_Dynamics( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py new file mode 100644 index 00000000..f4dd5657 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py @@ -0,0 +1,168 @@ +import logging 
+import math +import random +import sys + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim + +from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( + Probabilistic_Dynamics, +) +from cares_reinforcement_learning.networks.world_models.simple_sas_reward import ( + Simple_SAS_Reward, +) +from cares_reinforcement_learning.util.helpers import normalize_observation_delta + + +class EnsembleWorldAndOneSASReward: + """ + + """ + def __init__( + self, + observation_size: int, + num_actions: int, + num_models: int, + lr: float, + device: str, + hidden_size: int = 128, + ): + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + + self.reward_network = Simple_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) + + self.models = [ + Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + for _ in range(self.num_models) + ] + + self.optimizers = [optim.Adam(self.models[i].parameters(), lr=lr) for i in range(self.num_models)] + + self.statistics = {} + + # Bring all reward prediction and dynamic rediction networks to device. + self.device = device + self.reward_network.to(self.device) + for model in self.models: + model.to(device) + + def set_statistics(self, statistics: dict) -> None: + """ + Update all statistics for normalization for all world models and the + ensemble itself. + + :param (Dictionary) statistics: + """ + for key, value in statistics.items(): + if isinstance(value, np.ndarray): + statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) + + self.statistics = statistics + for model in self.models: + model.statistics = statistics + + def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation:torch.Tensor): + comb = torch.cat((observation, action, next_observation), dim=1) + pred_rewards = self.reward_network(comb) + return pred_rewards + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + means = [] + norm_means = [] + norm_vars = [] + # Iterate over the neural networks and get the predictions + for model in self.models: + # Predict delta + mean, n_mean, n_var = model.forward(observation, actions) + means.append(mean) + norm_means.append(n_mean) + norm_vars.append(n_var) + # Normalized + predictions_means = torch.stack(means) + predictions_norm_means = torch.stack(norm_means) + predictions_vars = torch.stack(norm_vars) + # Get rid of the nans + not_nans = [] + for i in range(self.num_models): + if not torch.any(torch.isnan(predictions_means[i])): + not_nans.append(i) + if len(not_nans) == 0: + logging.info("Predicting all Nans") + sys.exit() + # Random Take next state. 
+ rand_ind = random.randint(0, len(not_nans) - 1) + prediction = predictions_means[not_nans[rand_ind]] + # next = current + delta + prediction += observation + all_predictions = torch.stack(means) + for j in range(all_predictions.shape[0]): + all_predictions[j] += observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # For each model, train with different data. + mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) + + for i in range(self.num_models): + sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] + sub_target = sub_next_states - sub_states + + delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) + _, n_mean, n_var = self.models[i].forward(sub_states, sub_actions) + model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + + self.optimizers[i].zero_grad() + model_loss.backward() + self.optimizers[i].step() + + def train_reward( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + rewards: torch.Tensor, + ) -> None: + assert len(next_states.shape) >= 2 + self.reward_optimizer.zero_grad() + comb = torch.cat((states, actions, next_states), dim=1) + rwd_mean = self.reward_network.forward(comb) + reward_loss = F.mse_loss(rwd_mean, rewards) + reward_loss.backward() + self.reward_optimizer.step() + + diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_all.py b/cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py similarity index 94% rename from cares_reinforcement_learning/networks/world_models/ensemble_all.py rename to cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py index 9e3cf290..183fb7eb 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_all.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py @@ -9,18 +9,18 @@ from torch import optim from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - ProbabilisticDynamics, + Probabilistic_Dynamics, ) from cares_reinforcement_learning.networks.world_models.probabilistic_sas_reward import ( Probabilistic_SAS_Reward, ) -from cares_reinforcement_learning.networks.world_models.simple_sas_done import ( - SASDone, -) +# from cares_reinforcement_learning.networks.world_models.simple_sas_done import ( +# SASDone, +# ) from cares_reinforcement_learning.util.helpers import normalize_observation_delta -class EnsembleWorldRewardDone: +class EnsembleWorldEnsembleSASReward: """ This class consist of an ensemble of all components for critic update. Q_label = REWARD + gamma * (1 - DONES) * Q(NEXT_STATES). 
@@ -44,7 +44,7 @@ def __init__( self.num_actions = num_actions self.device = device - self.world_models = [ProbabilisticDynamics(observation_size=observation_size, num_actions=num_actions, + self.world_models = [Probabilistic_Dynamics(observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size) for _ in range(self.num_world_models)] self.reward_models = [Probabilistic_SAS_Reward(observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size) for _ in range(self.num_reward_models)] @@ -59,10 +59,10 @@ def __init__( for world_model in self.world_models: world_model.to(self.device) - self.done_model = SASDone(observation_size=observation_size, num_actions=num_actions, - hidden_size=hidden_size) - self.done_optimizers = optim.Adam(self.done_model.parameters(), lr=lr) - self.done_model.to(self.device) + # self.done_model = SASDone(observation_size=observation_size, num_actions=num_actions, + # hidden_size=hidden_size) + # self.done_optimizers = optim.Adam(self.done_model.parameters(), lr=lr) + # self.done_model.to(self.device) self.statistics = {} def set_statistics(self, statistics: dict) -> None: diff --git a/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py b/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py index 4c9ee396..f7e90a36 100644 --- a/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py @@ -10,7 +10,7 @@ from torch import optim from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - ProbabilisticDynamics, + Probabilistic_Dynamics, ) from cares_reinforcement_learning.networks.world_models.simple_sa_reward import ( Simple_SA_Reward, @@ -19,6 +19,10 @@ class EnsembleWorldAndOneSAReward: + """ + Specifications: + + """ def __init__( self, observation_size: int, @@ -40,7 +44,7 @@ def __init__( self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) self.models = [ - ProbabilisticDynamics( + Probabilistic_Dynamics( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, diff --git a/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py b/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py index ca559f55..8e67b2f9 100644 --- a/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py +++ b/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py @@ -10,7 +10,7 @@ ) -class ProbabilisticDynamics(nn.Module): +class Probabilistic_Dynamics(nn.Module): """ A world model with fully connected layers. It takes current states (s) and current actions (a), and predict next states (s'). diff --git a/cares_reinforcement_learning/networks/world_models/simple_sas_done.py b/cares_reinforcement_learning/networks/world_models/simple_sas_done.py index bff8cc14..d0810e53 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_sas_done.py +++ b/cares_reinforcement_learning/networks/world_models/simple_sas_done.py @@ -4,7 +4,7 @@ from cares_reinforcement_learning.util.helpers import weight_init -class SASDone(nn.Module): +class Simple_SAS_Done(nn.Module): def __init__(self, observation_size: int, num_actions: int, hidden_size: int): """ Note, This reward function is limited to 0 ~ 1 for dm_control. 
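The patches above introduce an SAS-conditioned reward model alongside an ensemble of probabilistic dynamics models that are fit on normalized next-state deltas. A minimal sketch of how one ensemble member and the reward head are trained, following EnsembleWorldAndOneSASReward.train_world / train_reward and the three-argument Simple_SAS_Reward signature from these patches; the helper name train_one_member and the optimizer plumbing are illustrative, not part of the library:

    import torch
    import torch.nn.functional as F
    from cares_reinforcement_learning.util.helpers import normalize_observation_delta

    def train_one_member(dynamics_model, dynamics_opt, reward_net, reward_opt,
                         states, actions, next_states, rewards, statistics):
        # Dynamics member: fit the normalized state delta with a Gaussian NLL,
        # as in EnsembleWorldAndOneSASReward.train_world above.
        delta_target = normalize_observation_delta(next_states - states, statistics)
        _, n_mean, n_var = dynamics_model(states, actions)
        dyn_loss = F.gaussian_nll_loss(input=n_mean, target=delta_target, var=n_var)
        dynamics_opt.zero_grad()
        dyn_loss.backward()
        dynamics_opt.step()

        # Reward head: plain regression on the (s, a, s') triple,
        # as in EnsembleWorldAndOneSASReward.train_reward above.
        pred_reward = reward_net(states, actions, next_states)
        reward_loss = F.mse_loss(pred_reward, rewards)
        reward_opt.zero_grad()
        reward_loss.backward()
        reward_opt.step()
        return dyn_loss.item(), reward_loss.item()

In the full ensemble each member receives a disjoint mini-batch slice, so the models decorrelate even though they share the same architecture.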
diff --git a/cares_reinforcement_learning/networks/world_models/simple_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple_sas_reward.py new file mode 100644 index 00000000..3e01cc31 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple_sas_reward.py @@ -0,0 +1,51 @@ +import torch +from torch import nn +import torch.nn.functional as F +from cares_reinforcement_learning.util.helpers import weight_init + + +class Simple_SAS_Reward(nn.Module): + def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). + + :param (int) observation_size -- dimension of states + :param (int) num_actions -- dimension of actions + :param (int) hidden_size -- size of neurons in hidden layers. + """ + super().__init__() + self.observation_size = observation_size + self.num_actions = num_actions + self.linear1 = nn.Linear(2 * observation_size + num_actions, hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + self.linear3 = nn.Linear(hidden_size, 1) + self.apply(weight_init) + + def forward( + self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor, normalized: bool = False + ) -> torch.Tensor: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. + + :param (Tensors) obs -- dimension of states + :param (Tensors) actions -- dimension of actions + :param (Bool) normalized -- whether normalized reward to 0~1 + + :return (Tensors) x -- predicted rewards. + """ + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + x = torch.cat((observation, actions, next_observation), dim=1) + x = self.linear1(x) + x = F.relu(x) + x = self.linear2(x) + x = F.relu(x) + rwd_mean = self.linear3(x) + if normalized: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py similarity index 99% rename from cares_reinforcement_learning/networks/world_models/ensemble_integrated.py rename to cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py index a4f17f31..5fdf2574 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py +++ b/cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py @@ -10,7 +10,7 @@ from torch import optim from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - ProbabilisticDynamics, + Probabilistic_Dynamics, ) from cares_reinforcement_learning.networks.world_models.simple_ns_reward import ( Simple_NS_Reward, @@ -36,7 +36,7 @@ def __init__( hidden_size: int, lr: float = 0.001, ): - self.dyna_network = ProbabilisticDynamics( + self.dyna_network = Probabilistic_Dynamics( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 863a6e61..e04b2efa 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -218,8 +218,26 @@ class DynaSACConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 -class 
DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_ScaleBatchReweight", Literal=True) +class DynaSAC_SASConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_SAS", Literal=True) + actor_lr: Optional[float] = 3e-4 + critic_lr: Optional[float] = 3e-4 + + alpha_lr: Optional[float] = 3e-4 + use_bounded_active: Optional[bool] = False + num_models: Optional[int] = 5 + + gamma: Optional[float] = 0.99 + tau: Optional[float] = 0.005 + reward_scale: Optional[float] = 1.0 + + horizon: Optional[int] = 1 + num_samples: Optional[int] = 10 + world_model_lr: Optional[float] = 0.001 + + +class DynaSAC_SAS_Immersive_WeightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_SAS_Immersive_Weight", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -243,8 +261,8 @@ class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): sample_times: Optional[int] = 10 -class DynaSAC_Immerse_Reweight_ComboConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_Immerse_Reweight_Combo", Literal=True) +class DynaSAC_ScaleBatchReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_ScaleBatchReweight", Literal=True) actor_lr: Optional[float] = 3e-4 critic_lr: Optional[float] = 3e-4 @@ -260,9 +278,11 @@ class DynaSAC_Immerse_Reweight_ComboConfig(AlgorithmConfig): num_samples: Optional[int] = 10 world_model_lr: Optional[float] = 0.001 - threshold_scale_critic: Optional[float] = 0.7 - threshold_scale_actor: Optional[float] = 0.7 + threshold_scale: Optional[float] = 0.7 + reweight_critic: Optional[bool] = True + reweight_actor: Optional[bool] = False + mode: Optional[int] = 1 sample_times: Optional[int] = 10 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 451b8f2c..d111f9ca 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -85,14 +85,15 @@ def create_STEVE_MEAN(observation_size, action_num, config: AlgorithmConfig): """ from cares_reinforcement_learning.algorithm.mbrl import STEVE_MEAN from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble_all import EnsembleWorldRewardDone + from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import \ + EnsembleWorldEnsembleSASReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldRewardDone( + world_model = EnsembleWorldEnsembleSASReward( observation_size=observation_size, num_actions=action_num, num_world_models=config.num_world_models, @@ -203,22 +204,62 @@ def create_DynaSAC_SABR(observation_size, action_num, config: AlgorithmConfig): return agent -def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_ScaleBatchReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldAndOneReward( + world_model = EnsembleWorldAndOneSASReward( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + lr=config.world_model_lr, + device=device, + ) + + agent = DynaSAC_SAS( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + device=device, + ) + return agent + + +def create_DynaSAC_SAS_Immerssive_Weight(observation_size, action_num, config: AlgorithmConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS_Immersive_Weight + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward + + actor = Actor(observation_size, action_num) + critic = Critic(observation_size, action_num) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + world_model = EnsembleWorldAndOneSASReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -226,7 +267,7 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo lr=config.world_model_lr, ) - agent = DynaSAC_ScaleBatchReweight( + agent = DynaSAC_SAS_Immersive_Weight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -248,22 +289,22 @@ def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: Algo return agent -def create_DynaSAC_Immerse_Reweight_Combo(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_ScaleBatchReweight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Immerse_Reweight_Combo + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_ScaleBatchReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldAndOneReward( + world_model = EnsembleWorldAndOneNSReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -271,7 +312,7 @@ def create_DynaSAC_Immerse_Reweight_Combo(observation_size, action_num, config: lr=config.world_model_lr, ) - agent = DynaSAC_Immerse_Reweight_Combo( + agent = DynaSAC_ScaleBatchReweight( actor_network=actor, critic_network=critic, world_network=world_model, @@ -284,8 +325,10 @@ def create_DynaSAC_Immerse_Reweight_Combo(observation_size, action_num, config: alpha_lr=config.alpha_lr, horizon=config.horizon, num_samples=config.num_samples, - threshold_scale_critic=config.threshold_scale_critic, - threshold_scale_actor=config.threshold_scale_actor, + threshold_scale=config.threshold_scale, + reweight_critic=config.reweight_critic, + reweight_actor=config.reweight_actor, + mode=config.mode, sample_times=config.sample_times, ) return agent @@ -299,14 +342,14 @@ def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmCo """ from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldAndOneReward( + world_model = EnsembleWorldAndOneNSReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -344,14 +387,14 @@ def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: Algorit """ from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldAndOneReward( + world_model = EnsembleWorldAndOneNSReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -389,14 +432,14 @@ def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmC """ from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward actor = Actor(observation_size, 
action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldAndOneReward( + world_model = EnsembleWorldAndOneNSReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -432,16 +475,16 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): An extra world model is added. """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneReward + from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward actor = Actor(observation_size, action_num) critic = Critic(observation_size, action_num) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - world_model = EnsembleWorldAndOneReward( + world_model = EnsembleWorldAndOneNSReward( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, @@ -449,7 +492,7 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): device=device, ) - agent = DynaSAC( + agent = DynaSAC_NS( actor_network=actor, critic_network=critic, world_network=world_model, From 5f901778e12f220ec4a2fd16232673ebef5b2ec0 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 11:27:48 +1200 Subject: [PATCH 56/91] naming convention --- cares_reinforcement_learning/util/configurations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index e04b2efa..c5736682 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -218,6 +218,7 @@ class DynaSACConfig(AlgorithmConfig): world_model_lr: Optional[float] = 0.001 + class DynaSAC_SASConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_SAS", Literal=True) actor_lr: Optional[float] = 3e-4 From 3b3449d643eb44203d854691c23e4f8c34982392 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 11:50:43 +1200 Subject: [PATCH 57/91] naming convention --- .../networks/world_models/ensemble_sas_world.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py index f4dd5657..da7b17b3 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py @@ -77,8 +77,7 @@ def set_statistics(self, statistics: dict) -> None: model.statistics = statistics def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation:torch.Tensor): - comb = torch.cat((observation, action, next_observation), dim=1) - pred_rewards = self.reward_network(comb) + pred_rewards = self.reward_network(observation, action, next_observation) return pred_rewards def pred_next_states( @@ -159,8 +158,7 @@ def train_reward( ) -> None: assert len(next_states.shape) >= 2 self.reward_optimizer.zero_grad() - comb = torch.cat((states, actions, next_states), dim=1) - rwd_mean = self.reward_network.forward(comb) + rwd_mean = self.reward_network(states, actions, next_states) reward_loss = F.mse_loss(rwd_mean, rewards) reward_loss.backward() 
self.reward_optimizer.step() From 9a2f73a149fded2f05a1e09f1710aa4ca5ec043e Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 11:57:09 +1200 Subject: [PATCH 58/91] typo --- cares_reinforcement_learning/util/network_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index d111f9ca..0def465b 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -244,7 +244,7 @@ def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): return agent -def create_DynaSAC_SAS_Immerssive_Weight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_SAS_Immersive_Weight(observation_size, action_num, config: AlgorithmConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From aefeef933fcb26c0d6b543ae41588386cf7127a7 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 12:07:04 +1200 Subject: [PATCH 59/91] typo --- .../algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index 162eb3b5..1a3634b3 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -263,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_state, pred_act, pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(pred_state, pred_acts, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) From 68803dd753c7b5b49fd7718e5aa3909343177e6f Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 16:28:07 +1200 Subject: [PATCH 60/91] typo --- .../algorithm/mbrl/DynaSAC_BIV_Reweight.py | 2 +- .../algorithm/mbrl/DynaSAC_NS.py | 2 +- .../algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py | 2 +- .../algorithm/mbrl/DynaSAC_SA.py | 2 +- .../algorithm/mbrl/DynaSAC_SAS.py | 2 +- .../mbrl/DynaSAC_SAS_Immersive_Weight.py | 15 +++++++++------ .../algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py | 2 +- .../algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py | 2 +- .../algorithm/mbrl/DynaSAC_UWAC_Reweight.py | 2 +- .../algorithm/mbrl/STEVE_MEAN_SAC.py | 2 +- .../networks/world_models/__init__.py | 6 +++--- .../networks/world_models/ensemble_ns_world.py | 3 +++ 12 files changed, 24 insertions(+), 18 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py index 070e1efb..10ad9fa7 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py 
b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py index 71cd2f53..da151034 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py index 0724e904..eb1c66e9 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py index c812574d..493d636a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneSAReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py index 2f2ad91a..456299ba 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_sas_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneSASReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index 1a3634b3..9b545811 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -15,10 +15,11 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_sas_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneSASReward, ) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta class DynaSAC_SAS_Immersive_Weight: """ @@ -316,11 +317,13 @@ def sampling(self, pred_state, pred_act, pred_means, pred_vars): for i in range(self.sample_times): if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, sample3[i]) - pred_rwd4 = 
self.world_model.pred_rewards(pred_state, pred_act, sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(pred_state, pred_act, sample5[i]) + denormalize_observation_delta(sample1[i], self.world_model.statistics) + + pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample1[i], self.world_model.statistics)) + pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample2[i], self.world_model.statistics)) + pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample3[i], self.world_model.statistics)) + pred_rwd4 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample4[i], self.world_model.statistics)) + pred_rwd5 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample5[i], self.world_model.statistics)) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py index a3e9f965..363f3b16 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneSAReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py index c5020491..3c4d8c45 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py index dfe04df4..3c9a6f39 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py index 2fb55b41..9737ed08 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py @@ -15,7 +15,7 @@ from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import ( +from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldEnsembleSASReward, ) diff --git 
a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 8bda984e..395dfb8b 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,6 +1,6 @@ -from cares_reinforcement_learning.networks.world_models.z_ensemble_integrated import ( - EnsembleWorldReward, -) +# from cares_reinforcement_learning.networks.world_models.z_ensemble_integrated import ( +# EnsembleWorldReward, +# ) from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( EnsembleWorldAndOneNSReward, diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py index e2221cbe..94c3ac1e 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py @@ -19,6 +19,9 @@ class EnsembleWorldAndOneNSReward: + """ + Spec + """ def __init__( self, observation_size: int, From 2124c7135aedf09a7165e35c124fb97d133cbbfc Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 16:29:42 +1200 Subject: [PATCH 61/91] typo --- .../algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index 9b545811..875699c1 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -317,8 +317,6 @@ def sampling(self, pred_state, pred_act, pred_means, pred_vars): for i in range(self.sample_times): if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - denormalize_observation_delta(sample1[i], self.world_model.statistics) - pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample1[i], self.world_model.statistics)) pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample2[i], self.world_model.statistics)) pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample3[i], self.world_model.statistics)) From c866d80a58c72b83d1dcb24fd0ab337236f38247 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 16:32:28 +1200 Subject: [PATCH 62/91] typo --- .../algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py index eb1c66e9..54397389 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py @@ -18,6 +18,7 @@ from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -311,11 +312,11 @@ def sampling(self, pred_means, pred_vars): for i in range(self.sample_times): if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = 
self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + pred_rwd1 = self.world_model.pred_rewards(denormalize_observation_delta(sample1[i], self.world_model.statistics)) + pred_rwd2 = self.world_model.pred_rewards(denormalize_observation_delta(sample2[i], self.world_model.statistics)) + pred_rwd3 = self.world_model.pred_rewards(denormalize_observation_delta(sample3[i], self.world_model.statistics)) + pred_rwd4 = self.world_model.pred_rewards(denormalize_observation_delta(sample4[i], self.world_model.statistics)) + pred_rwd5 = self.world_model.pred_rewards(denormalize_observation_delta(sample5[i], self.world_model.statistics)) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) From 65d7d4c24260d70a78b1c07aa102b75b7bba68c2 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 16:41:17 +1200 Subject: [PATCH 63/91] typo --- .../mbrl/DynaSAC_NS_Immersive_Weight.py | 36 +++++++++++-------- .../mbrl/DynaSAC_SAS_Immersive_Weight.py | 36 +++++++++++-------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py index 54397389..39b7d029 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py @@ -305,18 +305,24 @@ def sampling(self, pred_means, pred_vars): [self.sample_times]) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( [self.sample_times]) + rs = [] acts = [] qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): + sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(denormalize_observation_delta(sample1[i], self.world_model.statistics)) - pred_rwd2 = self.world_model.pred_rewards(denormalize_observation_delta(sample2[i], self.world_model.statistics)) - pred_rwd3 = self.world_model.pred_rewards(denormalize_observation_delta(sample3[i], self.world_model.statistics)) - pred_rwd4 = self.world_model.pred_rewards(denormalize_observation_delta(sample4[i], self.world_model.statistics)) - pred_rwd5 = self.world_model.pred_rewards(denormalize_observation_delta(sample5[i], self.world_model.statistics)) + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) @@ -324,11 +330,11 @@ def sampling(self, pred_means, pred_vars): rs.append(pred_rwd5) # Each times, 5 models predict different actions. 
# [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) acts.append(log_pi1) acts.append(log_pi2) acts.append(log_pi3) @@ -338,15 +344,15 @@ def sampling(self, pred_means, pred_vars): # Now: sample1 sample2... same next state, different model. # Pred_act1 pred_act2 same next_state, different actions. # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) qe = torch.minimum(qe1, qe2) qs.append(qa) qs.append(qb) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index 875699c1..471cda0a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -315,13 +315,19 @@ def sampling(self, pred_state, pred_act, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. 
for i in range(self.sample_times): + sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample1[i], self.world_model.statistics)) - pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample2[i], self.world_model.statistics)) - pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample3[i], self.world_model.statistics)) - pred_rwd4 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample4[i], self.world_model.statistics)) - pred_rwd5 = self.world_model.pred_rewards(pred_state, pred_act, denormalize_observation_delta(sample5[i], self.world_model.statistics)) + pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, sample1i) + pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, sample2i) + pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, sample3i) + pred_rwd4 = self.world_model.pred_rewards(pred_state, pred_act, sample4i) + pred_rwd5 = self.world_model.pred_rewards(pred_state, pred_act, sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) @@ -329,11 +335,11 @@ def sampling(self, pred_state, pred_act, pred_means, pred_vars): rs.append(pred_rwd5) # Each times, 5 models predict different actions. # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) acts.append(log_pi1) acts.append(log_pi2) acts.append(log_pi3) @@ -343,15 +349,15 @@ def sampling(self, pred_state, pred_act, pred_means, pred_vars): # Now: sample1 sample2... same next state, different model. # Pred_act1 pred_act2 same next_state, different actions. 
# 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) qe = torch.minimum(qe1, qe2) qs.append(qa) qs.append(qb) From 3b2d3679ea48799e77c9ccccb6981db13e031c4d Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 16:52:47 +1200 Subject: [PATCH 64/91] naming --- .../algorithm/mbrl/DynaSAC_BIV_Reweight.py | 38 +++++++++++-------- .../mbrl/DynaSAC_SA_Immersive_Weight.py | 27 ++++++++----- .../mbrl/DynaSAC_SUNRISE_Reweight.py | 37 ++++++++++-------- .../algorithm/mbrl/DynaSAC_UWAC_Reweight.py | 37 ++++++++++-------- 4 files changed, 84 insertions(+), 55 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py index 10ad9fa7..5c21e1c4 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py @@ -19,6 +19,8 @@ EnsembleWorldAndOneNSReward, ) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta + class DynaSAC_BIVReweight: """ @@ -308,12 +310,18 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): + sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) @@ -321,11 +329,11 @@ def sampling(self, pred_means, pred_vars): rs.append(pred_rwd5) # Each times, 5 models predict different actions. 
# [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) acts.append(log_pi1) acts.append(log_pi2) acts.append(log_pi3) @@ -335,15 +343,15 @@ def sampling(self, pred_means, pred_vars): # Now: sample1 sample2... same next state, different model. # Pred_act1 pred_act2 same next_state, different actions. # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) qe = torch.minimum(qe1, qe2) qs.append(qa) qs.append(qb) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py index 363f3b16..8ac48314 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py @@ -19,6 +19,7 @@ EnsembleWorldAndOneSAReward, ) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta class DynaSAC_SABR: """ @@ -308,13 +309,19 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): + sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + # Each times, 5 models predict different actions. # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) acts.append(log_pi1) acts.append(log_pi2) acts.append(log_pi3) @@ -324,15 +331,15 @@ def sampling(self, pred_means, pred_vars): # Now: sample1 sample2... same next state, different model. # Pred_act1 pred_act2 same next_state, different actions. 
# 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) qe = torch.minimum(qe1, qe2) qs.append(qa) qs.append(qb) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py index 3c4d8c45..240332c4 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py @@ -18,6 +18,7 @@ from cares_reinforcement_learning.networks.world_models import ( EnsembleWorldAndOneNSReward, ) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta class DynaSAC_SUNRISEReweight: @@ -308,12 +309,18 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): + sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) @@ -321,11 +328,11 @@ def sampling(self, pred_means, pred_vars): rs.append(pred_rwd5) # Each times, 5 models predict different actions. # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) acts.append(log_pi1) acts.append(log_pi2) acts.append(log_pi3) @@ -335,15 +342,15 @@ def sampling(self, pred_means, pred_vars): # Now: sample1 sample2... same next state, different model. # Pred_act1 pred_act2 same next_state, different actions. 
# 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) qe = torch.minimum(qe1, qe2) qs.append(qa) qs.append(qb) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py index 3c9a6f39..1257f5fb 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py @@ -19,6 +19,7 @@ EnsembleWorldAndOneNSReward, ) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta class DynaSAC_UWACReweight: """ @@ -308,12 +309,18 @@ def sampling(self, pred_means, pred_vars): qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): + sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1[i]) - pred_rwd2 = self.world_model.pred_rewards(sample2[i]) - pred_rwd3 = self.world_model.pred_rewards(sample3[i]) - pred_rwd4 = self.world_model.pred_rewards(sample4[i]) - pred_rwd5 = self.world_model.pred_rewards(sample5[i]) + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) @@ -321,11 +328,11 @@ def sampling(self, pred_means, pred_vars): rs.append(pred_rwd5) # Each times, 5 models predict different actions. # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1[i]) - pred_act2, log_pi2, _ = self.actor_net(sample2[i]) - pred_act3, log_pi3, _ = self.actor_net(sample3[i]) - pred_act4, log_pi4, _ = self.actor_net(sample4[i]) - pred_act5, log_pi5, _ = self.actor_net(sample5[i]) + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) acts.append(log_pi1) acts.append(log_pi2) acts.append(log_pi3) @@ -335,15 +342,15 @@ def sampling(self, pred_means, pred_vars): # Now: sample1 sample2... same next state, different model. # Pred_act1 pred_act2 same next_state, different actions. 
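DynaSAC_SUNRISEReweight's actual weight computation lies outside the shown hunks. For orientation, the weighting rule proposed in the SUNRISE paper maps the ensemble's Q standard deviation to a Bellman-target weight in (0.5, 1.0) through a temperature T; whether this class uses exactly that form cannot be confirmed from this diff:

import torch


def sunrise_weight(q_std: torch.Tensor, temperature: float = 10.0) -> torch.Tensor:
    """w(s, a) = sigmoid(-std(Q) * T) + 0.5: low std gives a weight near 1.0."""
    return torch.sigmoid(-q_std * temperature) + 0.5


print(sunrise_weight(torch.tensor([0.0, 0.1, 1.0])))
# approximately tensor([1.0000, 0.7689, 0.5000])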
# 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1[i], pred_act1) + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2[i], pred_act2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3[i], pred_act3) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4[i], pred_act4) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5[i], pred_act5) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) qe = torch.minimum(qe1, qe2) qs.append(qa) qs.append(qb) From 2641b3651aa2b086aedc5b0641c2e9daa79bd586 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 17:03:40 +1200 Subject: [PATCH 65/91] naming issue --- .../algorithm/mbrl/DynaSAC_BIV_Reweight.py | 9 +++++++-- .../mbrl/DynaSAC_NS_Immersive_Weight.py | 10 ++++++++-- .../mbrl/DynaSAC_SAS_Immersive_Weight.py | 18 +++++++++++------- .../mbrl/DynaSAC_SA_Immersive_Weight.py | 10 +++++++--- .../algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py | 10 +++++++--- .../algorithm/mbrl/DynaSAC_UWAC_Reweight.py | 10 +++++++--- 6 files changed, 47 insertions(+), 20 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py index 5c21e1c4..0dac6b34 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py @@ -263,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states= pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -285,7 +285,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars): + def sampling(self, curr_states, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -311,10 +311,15 @@ def sampling(self, pred_means, pred_vars): # Varying the next_state's distribution. 
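The five near-identical sample1 ... sample5 branches repeated across the files above (and extended with curr_states in the hunks that follow) could, in principle, be collapsed into a loop over the model dimension. The sketch below assumes the per-model samples are stacked as [n_models, batch, obs_dim] and that a denormalize callable, the actor and the target critic are passed in; the names are illustrative rather than the repository's API:

import torch


def q_samples_for_one_draw(samples, curr_states, denormalize, actor_net, target_critic_net):
    """Return the min-twin Q value under each world model's sampled next state."""
    qs = []
    for model_sample in samples:                    # iterate over the n_models axis
        next_state = denormalize(model_sample) + curr_states
        action, log_pi, _ = actor_net(next_state)   # policy action on the imagined state
        q1, q2 = target_critic_net(next_state, action)
        qs.append(torch.minimum(q1, q2))            # clipped double-Q estimate
    return qs


# Smoke test with stand-in callables (5 models, batch of 4, 3-dim observations):
mock_actor = lambda s: (torch.zeros(s.shape[0], 2), torch.zeros(s.shape[0], 1), None)
mock_critic = lambda s, a: (torch.ones(s.shape[0], 1), 2 * torch.ones(s.shape[0], 1))
out = q_samples_for_one_draw(torch.randn(5, 4, 3), torch.zeros(4, 3),
                             lambda d: d, mock_actor, mock_critic)
print(len(out), out[0].shape)  # 5 torch.Size([4, 1])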
for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i += curr_states sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i += curr_states sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i += curr_states sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i += curr_states sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + sample5i += curr_states # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1i) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py index 39b7d029..2e1472a6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py @@ -263,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states = pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -285,7 +285,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars): + def sampling(self, curr_states, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -312,10 +312,16 @@ def sampling(self, pred_means, pred_vars): # Varying the next_state's distribution. for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i += curr_states sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i += curr_states sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i += curr_states sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i += curr_states sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + sample5i += curr_states + if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1i) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index 471cda0a..b4078d22 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -286,7 +286,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_state, pred_act, pred_means, pred_vars): + def sampling(self, curr_state, curr_act, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -316,18 +316,22 @@ def sampling(self, pred_state, pred_act, pred_means, pred_vars): # Varying the next_state's distribution. 
for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i += curr_state sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i += curr_state sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i += curr_state sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i += curr_state sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - + sample5i += curr_state if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(pred_state, pred_act, sample1i) - pred_rwd2 = self.world_model.pred_rewards(pred_state, pred_act, sample2i) - pred_rwd3 = self.world_model.pred_rewards(pred_state, pred_act, sample3i) - pred_rwd4 = self.world_model.pred_rewards(pred_state, pred_act, sample4i) - pred_rwd5 = self.world_model.pred_rewards(pred_state, pred_act, sample5i) + pred_rwd1 = self.world_model.pred_rewards(curr_state, curr_act, sample1i) + pred_rwd2 = self.world_model.pred_rewards(curr_state, curr_act, sample2i) + pred_rwd3 = self.world_model.pred_rewards(curr_state, curr_act, sample3i) + pred_rwd4 = self.world_model.pred_rewards(curr_state, curr_act, sample4i) + pred_rwd5 = self.world_model.pred_rewards(curr_state, curr_act, sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py index 8ac48314..fa6d47ed 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py @@ -263,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -285,7 +285,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars): + def sampling(self, curr_states, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -310,11 +310,15 @@ def sampling(self, pred_means, pred_vars): # Varying the next_state's distribution. for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i += curr_states sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i += curr_states sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i += curr_states sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i += curr_states sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - + sample5i += curr_states # Each times, 5 models predict different actions. 
# [2560, 17] pred_act1, log_pi1, _ = self.actor_net(sample1i) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py index 240332c4..a1730ad2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py @@ -262,7 +262,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -284,7 +284,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars): + def sampling(self, curr_states, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -310,11 +310,15 @@ def sampling(self, pred_means, pred_vars): # Varying the next_state's distribution. for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i += curr_states sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i += curr_states sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i += curr_states sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i += curr_states sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - + sample5i += curr_states # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1i) pred_rwd2 = self.world_model.pred_rewards(sample2i) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py index 1257f5fb..4cfa46cd 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py @@ -262,7 +262,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states = pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -284,7 +284,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, pred_means, pred_vars): + def sampling(self, curr_states, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -310,11 +310,15 @@ def sampling(self, pred_means, pred_vars): # Varying the next_state's distribution. 
for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i += curr_states sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i += curr_states sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i += curr_states sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i += curr_states sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - + sample5i += curr_states # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1i) pred_rwd2 = self.world_model.pred_rewards(sample2i) From 295bcf02be5a6c128a08249ef5358edde0c1958a Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 9 Aug 2024 17:24:21 +1200 Subject: [PATCH 66/91] space --- .../algorithm/mbrl/DynaSAC_BIV_Reweight.py | 2 +- .../mbrl/DynaSAC_NS_Immersive_Weight.py | 2 +- .../mbrl/DynaSAC_SAS_Immersive_Weight.py | 22 +++++++++---------- .../algorithm/mbrl/DynaSAC_UWAC_Reweight.py | 6 +++-- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py index 0dac6b34..3c03b13a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py @@ -263,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(curr_states= pred_state, pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py index 2e1472a6..7ede052c 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py @@ -263,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(curr_states = pred_state, pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py index b4078d22..fa0115f0 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py @@ -286,7 +286,7 @@ def _dyna_generate_and_train(self, next_states): pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights ) - def sampling(self, curr_state, curr_act, pred_means, pred_vars): + def sampling(self, curr_states, curr_act, pred_means, pred_vars): """ High std means low uncertainty. Therefore, divided by 1 @@ -316,22 +316,22 @@ def sampling(self, curr_state, curr_act, pred_means, pred_vars): # Varying the next_state's distribution. 
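The UWAC hunk a little further below folds a covariance term cov_aq between the sampled log-probabilities and Q-values into its propagated variance (gamma^2 * var_a + gamma^2 * var_q + 2 * gamma^2 * cov_aq). A per-transition sample covariance, assuming both sample sets are stacked as [n_samples, batch, 1], can be computed as:

import torch


def sample_covariance(a: torch.Tensor, q: torch.Tensor) -> torch.Tensor:
    """cov_aq = mean over samples of (a - mean_a) * (q - mean_q), per transition."""
    a_centred = a - a.mean(dim=0, keepdim=True)
    q_centred = q - q.mean(dim=0, keepdim=True)
    return (a_centred * q_centred).mean(dim=0)      # shape [batch, 1]


a = torch.randn(50, 4, 1)
print(sample_covariance(a, 2.0 * a).squeeze())                   # positive: q moves with a
print(sample_covariance(a, torch.randn(50, 4, 1)).abs().mean())  # small for independent draws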
for i in range(self.sample_times): sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) - sample1i += curr_state + sample1i += curr_states sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) - sample2i += curr_state + sample2i += curr_states sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) - sample3i += curr_state + sample3i += curr_states sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) - sample4i += curr_state + sample4i += curr_states sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - sample5i += curr_state + sample5i += curr_states if self.reweight_critic == 1: # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(curr_state, curr_act, sample1i) - pred_rwd2 = self.world_model.pred_rewards(curr_state, curr_act, sample2i) - pred_rwd3 = self.world_model.pred_rewards(curr_state, curr_act, sample3i) - pred_rwd4 = self.world_model.pred_rewards(curr_state, curr_act, sample4i) - pred_rwd5 = self.world_model.pred_rewards(curr_state, curr_act, sample5i) + pred_rwd1 = self.world_model.pred_rewards(curr_states, curr_act, sample1i) + pred_rwd2 = self.world_model.pred_rewards(curr_states, curr_act, sample2i) + pred_rwd3 = self.world_model.pred_rewards(curr_states, curr_act, sample3i) + pred_rwd4 = self.world_model.pred_rewards(curr_states, curr_act, sample4i) + pred_rwd5 = self.world_model.pred_rewards(curr_states, curr_act, sample5i) rs.append(pred_rwd1) rs.append(pred_rwd2) rs.append(pred_rwd3) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py index 4cfa46cd..b093d4d6 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py @@ -21,6 +21,7 @@ from cares_reinforcement_learning.util.helpers import denormalize_observation_delta + class DynaSAC_UWACReweight: """ Max as ? 
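The UWAC hunk just below caps the inverse-variance weight at 1.5 (threshold_scale / total_var, clipped). Written as a standalone function, with threshold_scale treated as a tunable coefficient, that rule is:

import torch


def uwac_style_weight(total_var: torch.Tensor, threshold_scale: float, cap: float = 1.5) -> torch.Tensor:
    """Down-weight high-variance imagined transitions, never exceeding the cap."""
    return torch.minimum(threshold_scale / total_var, torch.full_like(total_var, cap))


print(uwac_style_weight(torch.tensor([0.1, 1.0, 10.0]), threshold_scale=1.0))
# tensor([1.5000, 1.0000, 0.1000])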
@@ -262,7 +263,7 @@ def _dyna_generate_and_train(self, next_states): pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( pred_state, pred_acts ) - uncert = self.sampling(curr_states = pred_state, pred_means=pred_mean, pred_vars=pred_var) + uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -395,7 +396,8 @@ def sampling(self, curr_states, pred_means, pred_vars): if self.mode == 1: total_var = gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - total_stds = torch.minimum(self.threshold_scale/total_var, torch.ones(total_var.shape).to(self.device) * 1.5) + total_stds = torch.minimum(self.threshold_scale / total_var, + torch.ones(total_var.shape).to(self.device) * 1.5) return total_stds.detach() From 0e7f6af5501a61ebb3f2233f09eb0efb3554aed7 Mon Sep 17 00:00:00 2001 From: tony Date: Sat, 21 Dec 2024 23:56:13 +1300 Subject: [PATCH 67/91] merge --- ...aSAC_BIV_Reweight.py => DynaSAC_BIV_NS.py} | 15 +- .../{DynaSAC_SAS.py => DynaSAC_Bounded.py} | 181 +++-- ...S_Immersive_Weight.py => DynaSAC_IW_NS.py} | 15 +- .../algorithm/mbrl/DynaSAC_NS.py | 95 ++- .../algorithm/mbrl/DynaSAC_SA.py | 304 -------- .../mbrl/DynaSAC_SAS_Immersive_Weight.py | 435 ----------- .../mbrl/DynaSAC_SA_Immersive_Weight.py | 397 ---------- ...RISE_Reweight.py => DynaSAC_SUNRISE_NS.py} | 14 +- ...AC_UWAC_Reweight.py => DynaSAC_UWAC_NS.py} | 14 +- .../algorithm/mbrl/STEVE_MEAN_SAC.py | 15 +- .../algorithm/mbrl/__init__.py | 15 +- .../algorithm/policy/SAC.py | 2 +- .../networks/world_models/__init__.py | 17 +- .../world_models/ensemble/__init__.py | 2 + .../ensemble/world_ensemble_big.py | 260 +++++++ .../ensemble/world_ensemble_one_rwd.py | 245 ++++++ .../world_models/ensemble_ns_world.py | 169 ----- .../world_models/ensemble_sas_world.py | 166 ----- .../ensemble_world_ensemble_sas_reward.py | 246 ------ .../world_models/ensmeble_sa_world.py | 165 ---- .../networks/world_models/simple/__init__.py | 5 + .../probabilistic_dynamic.py} | 47 +- .../simple/probabilistic_ns_reward.py | 36 + .../{ => simple}/probabilistic_sas_reward.py | 35 +- .../simple_ns_reward.py} | 31 +- .../{ => simple}/simple_sas_reward.py | 24 +- .../networks/world_models/simple_ns_reward.py | 53 -- .../networks/world_models/simple_sas_done.py | 50 -- .../networks/world_models/world_model.py | 174 +++++ .../world_models/z_ensemble_integrated.py | 307 -------- .../util/configurations.py | 702 ++++++++++++++++-- cares_reinforcement_learning/util/helpers.py | 44 ++ .../util/network_factory.py | 619 ++++++++------- 33 files changed, 1989 insertions(+), 2910 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_BIV_Reweight.py => DynaSAC_BIV_NS.py} (97%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_SAS.py => DynaSAC_Bounded.py} (53%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_NS_Immersive_Weight.py => DynaSAC_IW_NS.py} (97%) delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_SUNRISE_Reweight.py => DynaSAC_SUNRISE_NS.py} (97%) rename cares_reinforcement_learning/algorithm/mbrl/{DynaSAC_UWAC_Reweight.py => DynaSAC_UWAC_NS.py} (97%) create mode 100644 
cares_reinforcement_learning/networks/world_models/ensemble/__init__.py create mode 100644 cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py create mode 100644 cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py delete mode 100644 cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py delete mode 100644 cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py delete mode 100644 cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py delete mode 100644 cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py create mode 100644 cares_reinforcement_learning/networks/world_models/simple/__init__.py rename cares_reinforcement_learning/networks/world_models/{probabilistic_dynamics.py => simple/probabilistic_dynamic.py} (60%) create mode 100644 cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py rename cares_reinforcement_learning/networks/world_models/{ => simple}/probabilistic_sas_reward.py (65%) rename cares_reinforcement_learning/networks/world_models/{simple_sa_reward.py => simple/simple_ns_reward.py} (62%) rename cares_reinforcement_learning/networks/world_models/{ => simple}/simple_sas_reward.py (72%) delete mode 100644 cares_reinforcement_learning/networks/world_models/simple_ns_reward.py delete mode 100644 cares_reinforcement_learning/networks/world_models/simple_sas_done.py create mode 100644 cares_reinforcement_learning/networks/world_models/world_model.py delete mode 100644 cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py similarity index 97% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py index 3c03b13a..90c46dd3 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py @@ -12,11 +12,12 @@ from scipy.optimize import minimize import numpy as np import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer + import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneNSReward, +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -31,7 +32,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneNSReward, + world_network: Ensemble_Dyna_Big, gamma: float, tau: float, action_num: int, @@ -75,7 +76,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. 
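The scipy.optimize.minimize import kept in this file is consistent with batch-inverse-variance (BIV) weighting, where weights take the form 1 / (var + xi) and xi is grown until the effective sample size reaches a target fraction of the batch. The exact objective this class optimises is not visible in these hunks, so the search-based stand-in below is only a sketch:

import torch


def biv_weights(variances: torch.Tensor, min_ess_ratio: float = 0.5) -> torch.Tensor:
    """Inverse-variance weights with a floor on the effective sample size."""
    n = variances.numel()
    xi = 1e-8
    while True:
        w = 1.0 / (variances + xi)
        ess = w.sum() ** 2 / (w ** 2).sum()   # effective sample size, between 1 and n
        if ess >= min_ess_ratio * n:
            return w / w.sum()                # normalised per-sample weights
        xi *= 2.0                             # larger xi pulls the weights toward uniform


print(biv_weights(torch.tensor([0.01, 0.01, 1.0, 100.0])))
# most of the weight concentrates on the two low-variance samples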
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -197,7 +198,7 @@ def _train_policy( ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -217,7 +218,7 @@ def train_world_model( rewards=rewards, ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py similarity index 53% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 456299ba..f84c99b1 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -8,35 +8,47 @@ import copy import logging -import os import numpy as np import torch -import torch.nn.functional as F -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneSASReward, +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) +import torch.nn.functional as F - -class DynaSAC_SAS: +class DynaSAC_Bounded: def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneSASReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper:bool, + threshold:float, + exploration_sample:int ): + logging.info("-----------------------------------------------") + logging.info("----I am runing the DynaSAC_Bounded Agent! ----") + logging.info("-----------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False self.type = "mbrl" self.device = device @@ -64,7 +76,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -76,16 +88,42 @@ def __init__( def _alpha(self) -> float: return self.log_alpha.exp() - # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() with torch.no_grad(): state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave(state_tensor, self.exploration_sample, dim=0) + (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action) + # [10, 17] + aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + uncert = torch.mean(total_unc, dim=1) + multi_log_pi = multi_log_pi.squeeze() + policy_dist = F.softmax(multi_log_pi, dim=0) + world_dist = F.softmax(uncert, dim=0) + final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist + final_dist = F.softmax(final_dist, dim=0) + candi = torch.argmax(final_dist) + # new_dist = torch.distributions.Categorical(final_dist) + # candi = new_dist.sample([5]).squeeze() + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) else: (_, _, action) = self.actor_net(state_tensor) action = action.cpu().data.numpy().flatten() @@ -93,14 +131,16 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: - + if weights is None: + weights = torch.ones(rewards.shape) ################## Update the Critic First #################### with torch.no_grad(): next_actions, next_log_pi, _ = self.actor_net(next_states) @@ -138,7 +178,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -147,14 +187,14 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = 
memory.sample_uniform(batch_size) @@ -162,7 +202,6 @@ def train_world_model( states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) self.world_model.train_world( @@ -170,24 +209,30 @@ def train_world_model( actions=actions, next_states=next_states, ) - self.world_model.train_reward( - states=states, - actions=actions, - next_states=next_states, - rewards=rewards, - ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, dones, _ = experiences + # Convert into tensor states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + # Step 2 train as usual self._train_policy( states=states, @@ -195,9 +240,9 @@ def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None rewards=rewards, next_states=next_states, dones=dones, + weights=torch.ones(rewards.shape) ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) + # self._dyna_generate_and_train(next_states) def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_states = [] @@ -210,12 +255,21 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: for _ in range(self.horizon): pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) # This part is controversial. But random actions is empirically better. 
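The exploration branch added to DynaSAC_Bounded.select_action_from_policy above scores a set of candidate actions by blending the policy's own preference (a softmax over log-probabilities) with the world model's predictive uncertainty, then takes the argmax. The sketch below distils that logic with the ensemble statistics passed in directly; the clamping of extreme uncertainties used in the patch is omitted for brevity:

import torch
import torch.nn.functional as F


def pick_exploratory_action(actions, log_pi, nstate_means, nstate_vars, threshold):
    """actions: [k, act_dim]; log_pi: [k]; means/vars: [n_models, k, obs_dim]."""
    aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5             # within-model spread
    epistemic = torch.var(nstate_means, dim=0) ** 0.5                  # between-model spread
    uncert = ((aleatoric ** 2 + epistemic ** 2) ** 0.5).mean(dim=1)    # one score per action
    mixed = (1 - threshold) * F.softmax(log_pi, dim=0) + threshold * F.softmax(uncert, dim=0)
    return actions[torch.argmax(F.softmax(mixed, dim=0))]


candidates = torch.randn(10, 6)   # 10 sampled candidate actions, 6-dim action space
chosen = pick_exploratory_action(candidates, torch.randn(10),
                                 torch.randn(5, 10, 17), torch.rand(5, 10, 17), 0.1)
print(chosen.shape)  # torch.Size([6])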
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) + # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_acts, _, _ = self.actor_net(pred_state) pred_next_state, _, _, _ = self.world_model.pred_next_states( pred_state, pred_acts ) - pred_reward = self.world_model.pred_rewards(pred_state, pred_acts, pred_next_state) + + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards(observation=pred_state, + action=pred_acts, + next_observation=pred_next_state) + pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) @@ -229,23 +283,36 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, torch.ones(pred_rs.shape) ) + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) + self.set_stat = True def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") logging.info("models has been saved...") def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py similarity index 97% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py index 7ede052c..915bed13 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_Immersive_Weight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py @@ -12,12 +12,13 
@@ import numpy as np import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneNSReward, +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big ) + from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -31,7 +32,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneNSReward, + world_network: Ensemble_Dyna_Big, gamma: float, tau: float, action_num: int, @@ -75,7 +76,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -197,7 +198,7 @@ def _train_policy( ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -217,7 +218,7 @@ def train_world_model( rewards=rewards, ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py index da151034..00a65d55 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py @@ -8,16 +8,14 @@ import copy import logging -import os import numpy as np import torch -import torch.nn.functional as F -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer +from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneNSReward, +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) @@ -26,7 +24,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneNSReward, + world_network: Ensemble_Dyna_Big, gamma: float, tau: float, action_num: int, @@ -36,7 +34,17 @@ def __init__( num_samples: int, horizon: int, device: torch.device, + train_reward: bool, + train_both: bool, + gripper:bool, ): + logging.info("-------------------------------------------") + logging.info("----I am runing the Dyna_SAC_NS Agent! ----") + logging.info("-------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.type = "mbrl" self.device = device @@ -64,7 +72,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -76,7 +84,6 @@ def __init__( def _alpha(self) -> float: return self.log_alpha.exp() - # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: @@ -99,8 +106,10 @@ def _train_policy( rewards: torch.Tensor, next_states: torch.Tensor, dones: torch.Tensor, + weights: torch.Tensor, ) -> None: - + if weights is None: + weights = torch.ones(rewards.shape) ################## Update the Critic First #################### with torch.no_grad(): next_actions, next_log_pi, _ = self.actor_net(next_states) @@ -154,7 +163,7 @@ def _train_policy( ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) @@ -162,7 +171,6 @@ def train_world_model( states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) self.world_model.train_world( @@ -170,12 +178,18 @@ def train_world_model( actions=actions, next_states=next_states, ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) @@ -195,7 +209,9 @@ def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None rewards=rewards, next_states=next_states, dones=dones, + weights=torch.ones(rewards.shape) ) + # self._dyna_generate_and_train(next_states) def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_states = [] @@ -208,12 +224,21 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: for _ in range(self.horizon): pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) # This part is controversial. But random actions is empirically better. 
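The gripper reward_function added in these hunks (first in DynaSAC_Bounded, repeated just below for DynaSAC_NS) shapes the reward from the object-to-goal distance recovered from the state: distances of at most 10 return a fixed 800, distances above 70 return 0, and anything in between returns 70 minus the distance. A standalone version of that rule with a few worked values:

import torch


def gripper_reward(goal_distance: torch.Tensor) -> torch.Tensor:
    """Distance-based shaping used for the gripper task (distances in state units)."""
    reward = -goal_distance + 70
    reward[goal_distance <= 10] = 800    # close enough: success bonus
    reward[goal_distance > 70] = 0       # too far: no shaping signal
    return reward


print(gripper_reward(torch.tensor([5.0, 30.0, 69.0, 120.0])))
# tensor([800.,  40.,   1.,   0.])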
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) + # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_acts, _, _ = self.actor_net(pred_state) pred_next_state, _, _, _ = self.world_model.pred_next_states( pred_state, pred_acts ) - pred_reward = self.world_model.pred_rewards(pred_next_state) + + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards(observation=pred_state, + action=pred_acts, + next_observation=pred_next_state) + pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) pred_rs.append(pred_reward.detach()) @@ -227,23 +252,35 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, torch.ones(pred_rs.shape) ) + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") logging.info("models has been saved...") def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py deleted file mode 100644 index d0746377..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA.py +++ /dev/null @@ -1,304 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
- -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -import torch.nn.functional as F - -import cares_reinforcement_learning.util.helpers as hlp -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.DynaSAC import Actor, Critic -from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( - EnsembleWorldReward, -) -from cares_reinforcement_learning.util.configurations import DynaSACConfig - - -class DynaSAC: - def __init__( - self, - actor_network: Actor, - critic_network: Critic, - world_network: EnsembleWorldReward, - config: DynaSACConfig, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = config.gamma - self.tau = config.tau - - self.num_samples = config.num_samples - self.horizon = config.horizon - self.action_num = self.actor_net.num_actions - - self.learn_counter = 0 - self.policy_update_freq = config.policy_update_freq - self.target_update_freq = config.target_update_freq - - self.target_entropy = -self.action_num - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=config.actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=config.critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.log_alpha_optimizer = torch.optim.Adam( - [self.log_alpha], lr=config.alpha_lr - ) - - # World model - self.world_model = world_network - - @property - def _alpha(self) -> torch.Tensor: - return self.log_alpha.exp() - - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # pylint: disable-next=unused-argument - - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _update_critic_actor(self, states, actions, rewards, next_states, dones): - # Update Critic - self._update_critic(states, actions, rewards, next_states, dones) - - if self.learn_counter % self.policy_update_freq == 0: - # Update Actor - self._update_actor(states) - - if self.learn_counter % self.target_update_freq == 0: - hlp.soft_update_params(self.critic_net, self.target_critic_net, self.tau) - - def _update_critic(self, states, actions, rewards, next_states, dones): - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - critic_loss_one = 
F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - def _update_actor(self, states): - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - states=states, - actions=actions, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. 
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, _, _ = self.world_model.pred_next_states( - pred_state, pred_acts - ) - pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones - ) - - def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - - # Step 1 train as usual - self._update_critic_actor(states, actions, rewards, next_states, dones) - - # # # Step 2 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: - experiences = memory.sample_consecutive(batch_size) - - ( - states, - actions, - rewards, - next_states, - _, - _, - next_actions, - next_rewards, - _, - _, - _, - ) = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - next_rewards = ( - torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) - ) - next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) - - # Step 1 train the world model. 
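# A minimal sketch of the objective behind the world-model update below: the new
# ensemble world models in this patch fit the state delta with a Gaussian negative
# log-likelihood (the real code additionally normalizes the delta first). The
# function name dynamics_nll_loss is chosen here for illustration only.
import torch
import torch.nn.functional as F

def dynamics_nll_loss(pred_mean: torch.Tensor, pred_var: torch.Tensor,
                      states: torch.Tensor, next_states: torch.Tensor) -> torch.Tensor:
    # The dynamics model predicts the change in state, not the next state itself.
    target = next_states - states
    return F.gaussian_nll_loss(input=pred_mean, target=target, var=pred_var)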
- self.world_model.train_world( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - next_actions=next_actions, - next_rewards=next_rewards, - ) - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filepath: str, filename: str) -> None: - if not os.path.exists(filepath): - os.makedirs(filepath) - - torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py deleted file mode 100644 index fa0115f0..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SAS_Immersive_Weight.py +++ /dev/null @@ -1,435 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -import torch.nn.functional as F - -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneSASReward, -) - -from cares_reinforcement_learning.util.helpers import denormalize_observation_delta - -class DynaSAC_SAS_Immersive_Weight: - """ - Max as ? - """ - - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneSASReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - self.reweight_critic = reweight_critic - self.reweight_actor = reweight_actor - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
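# The log_alpha set up below is the learnable temperature for automatic entropy
# tuning. A minimal sketch of one temperature update step, assuming log_pi is the
# batch of action log-probabilities from the actor; the learning rate and action
# dimension are example values only.
import numpy as np
import torch

log_alpha = torch.tensor(np.log(1.0), requires_grad=True)
log_alpha_optimizer = torch.optim.Adam([log_alpha], lr=1e-3)  # example lr
target_entropy = -6.0  # -action_num; 6 is an example action dimension

def update_temperature(log_pi: torch.Tensor) -> torch.Tensor:
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
    log_alpha_optimizer.zero_grad()
    alpha_loss.backward()
    log_alpha_optimizer.step()
    return log_alpha.exp().detach()  # the alpha used in the actor and critic losses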
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - # Parameter - self.threshold_scale = threshold_scale - self.mode = mode - self.sample_times = sample_times - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - if self.reweight_critic: - # Reweighted loss function. weight not participant in training. - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. 
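# The ratio computed below rescales the weighted squared TD error back to the
# magnitude of the unweighted loss, so reweighting changes relative sample
# importance without shrinking the effective learning rate. A compact standalone
# sketch of this reweighted critic loss (function name chosen for illustration):
import torch

def reweighted_mse(q_values: torch.Tensor, q_target: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
    l2_loss = (q_values - q_target).pow(2)            # per-sample squared TD error
    disc_l2_loss = l2_loss * weights.detach()         # down-weight uncertain samples
    ratio = (torch.mean(l2_loss) / torch.mean(disc_l2_loss)).detach()
    return disc_l2_loss.mean() * ratio                # restore the original loss scale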
- - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - else: - critic_loss_one = F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - - if self.reweight_actor: - weights = weights.detach() - a_loss = (self._alpha * first_log_p) - min_qf_pi - disc_actor_loss = a_loss * weights - ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) - ratio = ratio.detach() - actor_loss = ratio * torch.mean(disc_actor_loss) - else: - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - states=states, - actions=actions, - next_states=next_states, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy 
Dyna will work. - :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(pred_state, pred_acts, pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_state, pred_acts, pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, curr_states, curr_act, pred_means, pred_vars): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - - Args: - pred_act: - pred_state: - """ - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) - rs = [] - acts = [] - qs = [] - # Varying the next_state's distribution. 
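# The block above draws sample_times samples from each ensemble member's Gaussian
# prediction to vary the next-state distribution. A compact equivalent over a
# stacked ensemble output, assuming the scale arguments are standard deviations
# with shape [num_models, batch, obs_dim] (names chosen here for illustration):
import torch

def sample_next_state_deltas(pred_means: torch.Tensor, pred_stds: torch.Tensor,
                             sample_times: int) -> torch.Tensor:
    dist = torch.distributions.Normal(pred_means, pred_stds)
    # Result shape: [sample_times, num_models, batch, obs_dim]
    return dist.sample((sample_times,))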
- for i in range(self.sample_times): - sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) - sample1i += curr_states - sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) - sample2i += curr_states - sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) - sample3i += curr_states - sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) - sample4i += curr_states - sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - sample5i += curr_states - if self.reweight_critic == 1: - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(curr_states, curr_act, sample1i) - pred_rwd2 = self.world_model.pred_rewards(curr_states, curr_act, sample2i) - pred_rwd3 = self.world_model.pred_rewards(curr_states, curr_act, sample3i) - pred_rwd4 = self.world_model.pred_rewards(curr_states, curr_act, sample4i) - pred_rwd5 = self.world_model.pred_rewards(curr_states, curr_act, sample5i) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. - # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1i) - pred_act2, log_pi2, _ = self.actor_net(sample2i) - pred_act3, log_pi3, _ = self.actor_net(sample3i) - pred_act4, log_pi4, _ = self.actor_net(sample4i) - pred_act5, log_pi5, _ = self.actor_net(sample5i) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. - # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1i, pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2i, pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3i, pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4i, pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5i, pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - if self.reweight_critic == 1: - rs = torch.stack(rs) - acts = torch.stack(acts) - qs = torch.stack(qs) - - if self.reweight_critic: - var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) - - gamma_sq = self.gamma * self.gamma - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra - - if self.reweight_actor: - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - # For actor: alpha^2 * var_a + var_q - total_var = (self._alpha ** 2) * var_a + var_q + (self._alpha ** 2) * cov_aq - - min_var = torch.min(total_var) - max_var = torch.max(total_var) - 
# As (max-min) decrease, threshold should go down. - threshold = self.threshold_scale * (max_var - min_var) + min_var - total_var[total_var <= threshold] = threshold - - total_var += 0.00000001 - total_stds = 1 / total_var - - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py deleted file mode 100644 index fa6d47ed..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SA_Immersive_Weight.py +++ /dev/null @@ -1,397 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer -import torch.nn.functional as F - -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneSAReward, -) - -from cares_reinforcement_learning.util.helpers import denormalize_observation_delta - -class DynaSAC_SABR: - """ - Max as ? - """ - - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneSAReward, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - self.reweight_critic = reweight_critic - self.reweight_actor = reweight_actor - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - # Parameter - self.threshold_scale = threshold_scale - self.mode = mode - self.sample_times = sample_times - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - if self.reweight_critic: - # Reweighted loss function. weight not participant in training. - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. 
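# The same detached-ratio trick appears again below for the critic and, further
# down in this file, for the actor objective when reweight_actor is set. For
# completeness, a standalone sketch of the actor-side variant (names chosen here
# for illustration):
import torch

def reweighted_actor_loss(alpha: torch.Tensor, log_pi: torch.Tensor,
                          min_qf_pi: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
    a_loss = (alpha * log_pi) - min_qf_pi
    disc_actor_loss = a_loss * weights.detach()
    ratio = (torch.mean(a_loss) / torch.mean(disc_actor_loss)).detach()
    return ratio * torch.mean(disc_actor_loss)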
- - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - else: - critic_loss_one = F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - - if self.reweight_actor: - weights = weights.detach() - a_loss = (self._alpha * first_log_p) - min_qf_pi - disc_actor_loss = a_loss * weights - ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) - ratio = ratio.detach() - actor_loss = ratio * torch.mean(disc_actor_loss) - else: - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int - ) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - states=states, - actions=actions, - rewards=rewards, - ) - - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy Dyna will work. 
- :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts - ) - uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_state, pred_acts) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, curr_states, pred_means, pred_vars): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) - acts = [] - qs = [] - # Varying the next_state's distribution. - for i in range(self.sample_times): - sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) - sample1i += curr_states - sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) - sample2i += curr_states - sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) - sample3i += curr_states - sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) - sample4i += curr_states - sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) - sample5i += curr_states - # Each times, 5 models predict different actions. - # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1i) - pred_act2, log_pi2, _ = self.actor_net(sample2i) - pred_act3, log_pi3, _ = self.actor_net(sample3i) - pred_act4, log_pi4, _ = self.actor_net(sample4i) - pred_act5, log_pi5, _ = self.actor_net(sample5i) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. 
- # Pred_act1 pred_act2 same next_state, different actions. - # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1i, pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2i, pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3i, pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4i, pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5i, pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - acts = torch.stack(acts) - qs = torch.stack(qs) - - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - if self.reweight_critic: - gamma_sq = self.gamma * self.gamma - total_var = gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - - if self.reweight_actor: - # For actor: alpha^2 * var_a + var_q - total_var = (self._alpha ** 2) * var_a + var_q + cov_aq - - min_var = torch.min(total_var) - max_var = torch.max(total_var) - # As (max-min) decrease, threshold should go down. - threshold = self.threshold_scale * (max_var - min_var) + min_var - total_var[total_var <= threshold] = threshold - total_var += 0.00000001 - total_stds = 1 / total_var - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py similarity index 97% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py index a1730ad2..eb8036d1 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py @@ -12,11 +12,11 @@ import numpy as np import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneNSReward, +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -30,7 +30,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneNSReward, + world_network: Ensemble_Dyna_Big, gamma: float, tau: 
float, action_num: int, @@ -74,7 +74,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -196,7 +196,7 @@ def _train_policy( ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -216,7 +216,7 @@ def train_world_model( rewards=rewards, ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py similarity index 97% rename from cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py rename to cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py index b093d4d6..57c58e7b 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_Reweight.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py @@ -12,11 +12,11 @@ import numpy as np import torch -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer import torch.nn.functional as F -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldAndOneNSReward, +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -31,7 +31,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldAndOneNSReward, + world_network: Ensemble_Dyna_Big, gamma: float, tau: float, action_num: int, @@ -75,7 +75,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -197,7 +197,7 @@ def _train_policy( ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -217,7 +217,7 @@ def train_world_model( rewards=rewards, ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py index 9737ed08..325d8a9c 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py @@ -13,10 +13,9 @@ import torch import torch.nn.functional as F -from cares_reinforcement_learning.memory import PrioritizedReplayBuffer - -from cares_reinforcement_learning.networks.world_models import ( - EnsembleWorldEnsembleSASReward, +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big ) @@ -25,7 +24,7 @@ def __init__( self, actor_network: torch.nn.Module, critic_network: torch.nn.Module, - world_network: EnsembleWorldEnsembleSASReward, + world_network: Ensemble_Dyna_Big, gamma: float, tau: float, action_num: int, @@ -61,7 +60,7 @@ def __init__( ) # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) self.log_alpha.requires_grad = True self.target_entropy = -action_num self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) @@ -185,7 +184,7 @@ def _train_policy( ) def train_world_model( - self, memory: PrioritizedReplayBuffer, batch_size: int + self, memory: MemoryBuffer, batch_size: int ) -> None: experiences = memory.sample_uniform(batch_size) @@ -208,7 +207,7 @@ def train_world_model( next_states=next_states ) - def train_policy(self, memory: PrioritizedReplayBuffer, batch_size: int) -> None: + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 experiences = memory.sample_uniform(batch_size) diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 3ff88b02..379eb6b9 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,12 +1,7 @@ from .DynaSAC_NS import DynaSAC_NS -from .DynaSAC_NS_Immersive_Weight import DynaSAC_ScaleBatchReweight -from .DynaSAC_SA import DynaSAC_SA -from .DynaSAC_SA_Immersive_Weight import DynaSAC_SABR -from .DynaSAC_SAS import DynaSAC_SAS -from .DynaSAC_SAS_Immersive_Weight import DynaSAC_SAS_Immersive_Weight - -from .DynaSAC_SUNRISE_Reweight import DynaSAC_SUNRISEReweight -from .DynaSAC_UWAC_Reweight import DynaSAC_UWACReweight -from .DynaSAC_BIV_Reweight import DynaSAC_BIVReweight - +from .DynaSAC_Bounded import DynaSAC_Bounded +from .DynaSAC_IW_NS import DynaSAC_ScaleBatchReweight +from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight +from .DynaSAC_UWAC_NS import DynaSAC_UWACReweight +from .DynaSAC_BIV_NS import DynaSAC_BIVReweight from .STEVE_MEAN_SAC import STEVE_MEAN diff --git a/cares_reinforcement_learning/algorithm/policy/SAC.py b/cares_reinforcement_learning/algorithm/policy/SAC.py index 8ba82ae8..313cd3f3 100644 --- a/cares_reinforcement_learning/algorithm/policy/SAC.py +++ b/cares_reinforcement_learning/algorithm/policy/SAC.py @@ -59,7 +59,7 @@ def __init__( # Temperature (alpha) for the entropy loss # Set to initial alpha to 1.0 according to other baselines. 
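# The hunk below (and the matching hunks above) swaps the 0-dim float64
# log-temperature for a 1-element float32 tensor. A quick check of what actually
# changes: only the shape and dtype differ, the initial temperature is still 1.0.
import numpy as np
import torch

log_alpha_old = torch.tensor(np.log(1.0))          # shape [], dtype float64
log_alpha_new = torch.FloatTensor([np.log(1.0)])   # shape [1], dtype float32
assert log_alpha_old.exp().item() == 1.0
assert log_alpha_new.exp().item() == 1.0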
init_temperature = 1.0 - self.log_alpha = torch.tensor(np.log(init_temperature)).to(device) + self.log_alpha = torch.FloatTensor([np.log(init_temperature)]).to(device) self.log_alpha.requires_grad = True self.log_alpha_optimizer = torch.optim.Adam( [self.log_alpha], lr=config.alpha_lr diff --git a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 395dfb8b..3efe4bb0 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,19 +1,4 @@ # from cares_reinforcement_learning.networks.world_models.z_ensemble_integrated import ( # EnsembleWorldReward, # ) - -from cares_reinforcement_learning.networks.world_models.ensemble_ns_world import ( - EnsembleWorldAndOneNSReward, -) - -from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import ( - EnsembleWorldAndOneSAReward, -) - -from cares_reinforcement_learning.networks.world_models.ensemble_sas_world import ( - EnsembleWorldAndOneSASReward, -) - -from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import ( - EnsembleWorldEnsembleSASReward, -) +from .world_model import World_Model diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py b/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py new file mode 100644 index 00000000..29153282 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py @@ -0,0 +1,2 @@ +from .world_ensemble_one_rwd import Ensemble_Dyna_One_Reward +from .world_ensemble_big import Ensemble_Dyna_Big \ No newline at end of file diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py new file mode 100644 index 00000000..dec35183 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py @@ -0,0 +1,260 @@ +import math +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim +from cares_reinforcement_learning.networks.world_models.simple import Probabilistic_Dynamics +from cares_reinforcement_learning.networks.world_models import World_Model +from cares_reinforcement_learning.util.helpers import normalize_observation_delta +from cares_reinforcement_learning.util import denormalize_observation_delta, normalize_observation + +def sig(x): + """ + Sigmoid + :param x: + :return: + """ + return 1 / (1 + np.exp(-x)) + + +class Ensemble_Dyna_Big(World_Model): + """ + World Model + """ + def __init__(self, + observation_size: int, + num_actions: int, + device: str, + l_r: float = 0.001, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = True, + num_models: int = 7, + boost_inter: int = 3, + ): + super().__init__(observation_size, num_actions, l_r, device, hidden_size, sas, prob_rwd) + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + self.l_r = l_r + self.curr_losses = np.ones((self.num_models,)) * 5 + + self.world_models = [] + for i in range(self.num_models): + i %= 3 + if i == 0: + model = Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=[128, 128, 128], + ) + if i == 1: + model = Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=[128, 128], + ) + 
if i == 2: + model = Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=[256, 256], + ) + self.world_models.append(model) + + self.optimizers = [optim.Adam(self.world_models[i].parameters(), lr=l_r) for i in range(self.num_models)] + self.statistics = {} + # Bring all reward prediction and dynamic rediction networks to device. + self.device = device + for model in self.world_models: + model.to(device) + self.boost_inter = boost_inter + self.update_counter = 0 + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + norm_means = [] + norm_vars = [] + normalized_observation = normalize_observation(observation, self.statistics) + # Iterate over the neural networks and get the predictions + for model in self.world_models: + # Predict delta + n_mean, n_var = model.forward(normalized_observation, actions) + norm_means.append(n_mean) + norm_vars.append(n_var) + predictions_vars = torch.stack(norm_vars) + predictions_norm_means = torch.stack(norm_means) + # Normalized + predictions_means = denormalize_observation_delta(predictions_norm_means, self.statistics) + all_predictions = predictions_means + observation + denorm_avg = torch.mean(predictions_means, dim=0) + prediction = denorm_avg + observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + # This boosting part is useless, cause inaccuracy. + # weights = 1.5 - sig(self.curr_losses) + # weights /= np.max(weights) + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # min_ = np.min(self.curr_losses) + # max_ = np.max(self.curr_losses) + # delta = max_ - min_ + # if delta == 0: + # delta = 0.1 + # temp = (self.curr_losses - min_) / delta * 5.0 + # temp = sig(temp) + # temp[index] * + index = int(math.floor(self.update_counter / self.boost_inter)) + target = next_states - states + delta_targets_normalized = normalize_observation_delta(target, self.statistics) + normalized_state = normalize_observation(states, self.statistics) + n_mean, n_var = self.world_models[index].forward(normalized_state, actions) + model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + self.optimizers[index].zero_grad() + model_loss.backward() + self.optimizers[index].step() + self.curr_losses[index] = model_loss.item() + self.update_counter += 1 + self.update_counter %= self.boost_inter * self.num_models + + def estimate_uncertainty( + self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool + ) -> tuple[float, float, torch.Tensor]: + """ + Estimate uncertainty. 
+ + :param observation: + :param actions: + """ + next_state_samples = None + uncert_rwd = 0.0 + means = [] + vars_s = [] + normalized_state = normalize_observation(observation, self.statistics) + for model in self.world_models: + mean, var = model.forward(normalized_state, actions) + means.append(mean) + vars_s.append(var) + vars_s = torch.stack(vars_s).squeeze() + noises = vars_s.cpu().detach().numpy() + aleatoric = (noises ** 2).mean(axis=0) ** 0.5 + all_means = torch.stack(means).squeeze() + epistemic = all_means.cpu().detach().numpy() + epistemic = epistemic.var(axis=0) ** 0.5 + aleatoric = np.minimum(aleatoric, 10e3) + epistemic = np.minimum(epistemic, 10e3) + total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + uncert = np.mean(total_unc) + if train_reward: + # Reward Uncertainty + sample_times = 20 + means = torch.vstack(means) + dist = torch.distributions.Normal(means, vars_s) + samples = dist.sample([sample_times]) + samples = torch.reshape(samples, (sample_times * self.num_models, self.observation_size)) + samples = denormalize_observation_delta(samples, self.statistics) + observationss = torch.repeat_interleave(observation, repeats=sample_times * self.num_models, dim=0) + actionss = torch.repeat_interleave(actions, repeats=sample_times * self.num_models, dim=0) + samples += observationss + + if self.sas: + if self.prob_rwd: + rewards, rwd_var = self.reward_network(observationss, actionss, samples) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + else: + rewards = self.reward_network(observationss, actionss, samples) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + if self.prob_rwd: + rewards, rwd_var = self.reward_network(samples, actionss) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + else: + rewards = self.reward_network(samples, actionss) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + dist = torch.distributions.Normal(all_means, vars_s) + next_state_samples = dist.sample([20]) + next_state_samples = next_state_samples.reshape((self.num_models * 20, self.observation_size)) + next_state_samples = denormalize_observation_delta(next_state_samples, self.statistics) + next_state_samples += observation + return uncert, uncert_rwd, next_state_samples + + def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor): + sample_times = 20 + normalized_state = normalize_observation(states, self.statistics) + mean_s = [] + var_s = [] + act_s = [] + state_s = [] + rwd_s = [] + for i in range(self.num_models): + mean, var = self.world_models[i].forward(normalized_state, actions) + mean_s.append(mean) + var_s.append(var) + act_s.append(actions) + state_s.append(states) + rwd_s.append(rewards) + + mean_s = torch.vstack(mean_s) + var_s = torch.vstack(var_s) + act_s = torch.vstack(act_s) + state_s = torch.vstack(state_s) + rwd_s = torch.vstack(rwd_s) + + dist = torch.distributions.Normal(mean_s, var_s) + samples = (dist.sample([sample_times])) + + actions = torch.repeat_interleave(act_s.unsqueeze(dim=0), repeats=sample_times, dim=0) + states = 
torch.repeat_interleave(state_s.unsqueeze(dim=0), repeats=sample_times,dim=0) + rwd_s = torch.repeat_interleave(rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0) + + samples = torch.reshape(samples, (samples.shape[0] * samples.shape[1], self.observation_size)) + states = torch.reshape(states, (states.shape[0] * states.shape[1], states.shape[2])) + actions = torch.reshape(actions, (actions.shape[0] * actions.shape[1], actions.shape[2])) + rwd_s = torch.reshape(rwd_s, (rwd_s.shape[0] * rwd_s.shape[1], rwd_s.shape[2])) + + samples = denormalize_observation_delta(samples, self.statistics) + samples += states + + + if self.prob_rwd: + if self.sas: + rwd_mean, rwd_var = self.reward_network(states, actions, samples) + else: + rwd_mean, rwd_var = self.reward_network(samples) + rwd_loss = F.gaussian_nll_loss(rwd_mean, rwd_s, rwd_var) + else: + if self.sas: + rwd_mean = self.reward_network(states, actions, samples) + else: + rwd_mean = self.reward_network(samples) + rwd_loss = F.mse_loss(rwd_mean, rwd_s) + self.reward_optimizer.zero_grad() + rwd_loss.backward() + self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py new file mode 100644 index 00000000..6affe44b --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py @@ -0,0 +1,245 @@ +import math +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim +from cares_reinforcement_learning.networks.world_models.simple import Probabilistic_Dynamics +from cares_reinforcement_learning.networks.world_models import World_Model +from cares_reinforcement_learning.util.helpers import normalize_observation_delta +from cares_reinforcement_learning.util import denormalize_observation_delta, normalize_observation + +def sig(x): + """ + Sigmoid + :param x: + :return: + """ + return 1 / (1 + np.exp(-x)) + + +class Ensemble_Dyna_One_Reward(World_Model): + """ + World Model + """ + def __init__(self, + observation_size: int, + num_actions: int, + device: str, + num_models: int = 5, + l_r: float = 0.001, + boost_inter: int = 3, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = True): + super().__init__(observation_size, num_actions, l_r, device, hidden_size, sas, prob_rwd) + if hidden_size is None: + hidden_size = [128, 128] + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + self.l_r = l_r + self.curr_losses = np.ones((self.num_models,)) * 5 + self.world_models = [ + Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + for _ in range(self.num_models) + ] + self.optimizers = [optim.Adam(self.world_models[i].parameters(), lr=l_r) for i in range(self.num_models)] + self.statistics = {} + # Bring all reward prediction and dynamic rediction networks to device. 
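# train_world below updates a single ensemble member per call, cycling through
# the members every boost_inter updates via index = floor(update_counter / boost_inter).
# A standalone sketch of that schedule (helper name chosen here for illustration):
import math

def next_model_index(update_counter: int, boost_inter: int, num_models: int) -> tuple[int, int]:
    index = int(math.floor(update_counter / boost_inter))       # member trained this call
    update_counter = (update_counter + 1) % (boost_inter * num_models)
    return index, update_counter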
+ self.device = device + for model in self.world_models: + model.to(device) + self.boost_inter = boost_inter + self.update_counter = 0 + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + norm_means = [] + norm_vars = [] + normalized_observation = normalize_observation(observation, self.statistics) + # Iterate over the neural networks and get the predictions + for model in self.world_models: + # Predict delta + n_mean, n_var = model.forward(normalized_observation, actions) + norm_means.append(n_mean) + norm_vars.append(n_var) + predictions_vars = torch.stack(norm_vars) + predictions_norm_means = torch.stack(norm_means) + # Normalized + predictions_means = denormalize_observation_delta(predictions_norm_means, self.statistics) + all_predictions = predictions_means + observation + denorm_avg = torch.mean(predictions_means, dim=0) + prediction = denorm_avg + observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + # This boosting part is useless, cause inaccuracy. + # weights = 1.5 - sig(self.curr_losses) + # weights /= np.max(weights) + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # min_ = np.min(self.curr_losses) + # max_ = np.max(self.curr_losses) + # delta = max_ - min_ + # if delta == 0: + # delta = 0.1 + # temp = (self.curr_losses - min_) / delta * 5.0 + # temp = sig(temp) + # temp[index] * + index = int(math.floor(self.update_counter / self.boost_inter)) + target = next_states - states + delta_targets_normalized = normalize_observation_delta(target, self.statistics) + normalized_state = normalize_observation(states, self.statistics) + n_mean, n_var = self.world_models[index].forward(normalized_state, actions) + model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + self.optimizers[index].zero_grad() + model_loss.backward() + self.optimizers[index].step() + self.curr_losses[index] = model_loss.item() + self.update_counter += 1 + self.update_counter %= self.boost_inter * self.num_models + + def estimate_uncertainty( + self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool + ) -> tuple[float, float, torch.Tensor]: + """ + Estimate uncertainty. 
+ + :param observation: + :param actions: + """ + next_state_samples = None + uncert_rwd = 0.0 + means = [] + vars_s = [] + normalized_state = normalize_observation(observation, self.statistics) + for model in self.world_models: + mean, var = model.forward(normalized_state, actions) + means.append(mean) + vars_s.append(var) + vars_s = torch.stack(vars_s).squeeze() + noises = vars_s.cpu().detach().numpy() + aleatoric = (noises ** 2).mean(axis=0) ** 0.5 + all_means = torch.stack(means).squeeze() + epistemic = all_means.cpu().detach().numpy() + epistemic = epistemic.var(axis=0) ** 0.5 + aleatoric = np.minimum(aleatoric, 10e3) + epistemic = np.minimum(epistemic, 10e3) + total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + uncert = np.mean(total_unc) + if train_reward: + # Reward Uncertainty + sample_times = 20 + means = torch.vstack(means) + dist = torch.distributions.Normal(means, vars_s) + samples = dist.sample([sample_times]) + samples = torch.reshape(samples, (sample_times * self.num_models, self.observation_size)) + samples = denormalize_observation_delta(samples, self.statistics) + observationss = torch.repeat_interleave(observation, repeats=sample_times * self.num_models, dim=0) + actionss = torch.repeat_interleave(actions, repeats=sample_times * self.num_models, dim=0) + samples += observationss + + if self.sas: + if self.prob_rwd: + rewards, rwd_var = self.reward_network(observationss, actionss, samples) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + else: + rewards = self.reward_network(observationss, actionss, samples) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + if self.prob_rwd: + rewards, rwd_var = self.reward_network(samples, actionss) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + else: + rewards = self.reward_network(samples, actionss) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + dist = torch.distributions.Normal(all_means, vars_s) + next_state_samples = dist.sample([20]) + next_state_samples = next_state_samples.reshape((self.num_models * 20, self.observation_size)) + next_state_samples = denormalize_observation_delta(next_state_samples, self.statistics) + next_state_samples += observation + return uncert, uncert_rwd, next_state_samples + + def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor): + sample_times = 20 + normalized_state = normalize_observation(states, self.statistics) + mean_s = [] + var_s = [] + act_s = [] + state_s = [] + rwd_s = [] + for i in range(self.num_models): + mean, var = self.world_models[i].forward(normalized_state, actions) + mean_s.append(mean) + var_s.append(var) + act_s.append(actions) + state_s.append(states) + rwd_s.append(rewards) + + mean_s = torch.vstack(mean_s) + var_s = torch.vstack(var_s) + act_s = torch.vstack(act_s) + state_s = torch.vstack(state_s) + rwd_s = torch.vstack(rwd_s) + + dist = torch.distributions.Normal(mean_s, var_s) + samples = (dist.sample([sample_times])) + + actions = torch.repeat_interleave(act_s.unsqueeze(dim=0), repeats=sample_times, dim=0) + states = 
torch.repeat_interleave(state_s.unsqueeze(dim=0), repeats=sample_times,dim=0) + rwd_s = torch.repeat_interleave(rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0) + + samples = torch.reshape(samples, (samples.shape[0] * samples.shape[1], self.observation_size)) + states = torch.reshape(states, (states.shape[0] * states.shape[1], states.shape[2])) + actions = torch.reshape(actions, (actions.shape[0] * actions.shape[1], actions.shape[2])) + rwd_s = torch.reshape(rwd_s, (rwd_s.shape[0] * rwd_s.shape[1], rwd_s.shape[2])) + + samples = denormalize_observation_delta(samples, self.statistics) + samples += states + + + if self.prob_rwd: + if self.sas: + rwd_mean, rwd_var = self.reward_network(states, actions, samples) + else: + rwd_mean, rwd_var = self.reward_network(samples, actions) + rwd_loss = F.gaussian_nll_loss(rwd_mean, rwd_s, rwd_var) + else: + if self.sas: + rwd_mean = self.reward_network(states, actions, samples) + else: + rwd_mean = self.reward_network(samples, actions) + rwd_loss = F.mse_loss(rwd_mean, rwd_s) + self.reward_optimizer.zero_grad() + rwd_loss.backward() + self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py deleted file mode 100644 index 94c3ac1e..00000000 --- a/cares_reinforcement_learning/networks/world_models/ensemble_ns_world.py +++ /dev/null @@ -1,169 +0,0 @@ -import logging -import math -import random -import sys - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils -from torch import optim - -from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - Probabilistic_Dynamics, -) -from cares_reinforcement_learning.networks.world_models.simple_ns_reward import ( - Simple_NS_Reward, -) -from cares_reinforcement_learning.util.helpers import normalize_observation_delta - - -class EnsembleWorldAndOneNSReward: - """ - Spec - """ - def __init__( - self, - observation_size: int, - num_actions: int, - num_models: int, - lr: float, - device: str, - hidden_size: int = 128, - ): - self.num_models = num_models - self.observation_size = observation_size - self.num_actions = num_actions - - self.reward_network = Simple_NS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) - - self.models = [ - Probabilistic_Dynamics( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - for _ in range(self.num_models) - ] - - self.optimizers = [optim.Adam(self.models[i].parameters(), lr=lr) for i in range(self.num_models)] - - self.statistics = {} - - # Bring all reward prediction and dynamic rediction networks to device. - self.device = device - self.reward_network.to(self.device) - for model in self.models: - model.to(device) - - def set_statistics(self, statistics: dict) -> None: - """ - Update all statistics for normalization for all world models and the - ensemble itself. 
- - :param (Dictionary) statistics: - """ - for key, value in statistics.items(): - if isinstance(value, np.ndarray): - statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) - - self.statistics = statistics - for model in self.models: - model.statistics = statistics - - def pred_rewards(self, observation: torch.Tensor): - pred_rewards = self.reward_network(observation) - return pred_rewards - - def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) - means = [] - norm_means = [] - norm_vars = [] - # Iterate over the neural networks and get the predictions - for model in self.models: - # Predict delta - mean, n_mean, n_var = model.forward(observation, actions) - means.append(mean) - norm_means.append(n_mean) - norm_vars.append(n_var) - # Normalized - predictions_means = torch.stack(means) - predictions_norm_means = torch.stack(norm_means) - predictions_vars = torch.stack(norm_vars) - # Get rid of the nans - not_nans = [] - for i in range(self.num_models): - if not torch.any(torch.isnan(predictions_means[i])): - not_nans.append(i) - if len(not_nans) == 0: - logging.info("Predicting all Nans") - sys.exit() - # Random Take next state. - rand_ind = random.randint(0, len(not_nans) - 1) - prediction = predictions_means[not_nans[rand_ind]] - # next = current + delta - prediction += observation - all_predictions = torch.stack(means) - for j in range(all_predictions.shape[0]): - all_predictions[j] += observation - return prediction, all_predictions, predictions_norm_means, predictions_vars - - def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - ) -> None: - - assert len(states.shape) >= 2 - assert len(actions.shape) == 2 - assert ( - states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size - ) - # For each model, train with different data. 
- mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) - - for i in range(self.num_models): - sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_target = sub_next_states - sub_states - - delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) - _, n_mean, n_var = self.models[i].forward(sub_states, sub_actions) - model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() - - self.optimizers[i].zero_grad() - model_loss.backward() - self.optimizers[i].step() - - def train_reward( - self, - next_states: torch.Tensor, - rewards: torch.Tensor, - ) -> None: - assert len(next_states.shape) >= 2 - # assert len(actions.shape) == 2 - # assert ( - # next_states.shape[1] + actions.shape[1] - # == self.num_actions + self.observation_size - # ) - self.reward_optimizer.zero_grad() - rwd_mean = self.reward_network.forward(next_states) - reward_loss = F.mse_loss(rwd_mean, rewards) - reward_loss.backward() - self.reward_optimizer.step() - - diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py b/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py deleted file mode 100644 index da7b17b3..00000000 --- a/cares_reinforcement_learning/networks/world_models/ensemble_sas_world.py +++ /dev/null @@ -1,166 +0,0 @@ -import logging -import math -import random -import sys - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils -from torch import optim - -from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - Probabilistic_Dynamics, -) -from cares_reinforcement_learning.networks.world_models.simple_sas_reward import ( - Simple_SAS_Reward, -) -from cares_reinforcement_learning.util.helpers import normalize_observation_delta - - -class EnsembleWorldAndOneSASReward: - """ - - """ - def __init__( - self, - observation_size: int, - num_actions: int, - num_models: int, - lr: float, - device: str, - hidden_size: int = 128, - ): - self.num_models = num_models - self.observation_size = observation_size - self.num_actions = num_actions - - self.reward_network = Simple_SAS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) - - self.models = [ - Probabilistic_Dynamics( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - for _ in range(self.num_models) - ] - - self.optimizers = [optim.Adam(self.models[i].parameters(), lr=lr) for i in range(self.num_models)] - - self.statistics = {} - - # Bring all reward prediction and dynamic rediction networks to device. - self.device = device - self.reward_network.to(self.device) - for model in self.models: - model.to(device) - - def set_statistics(self, statistics: dict) -> None: - """ - Update all statistics for normalization for all world models and the - ensemble itself. 
- - :param (Dictionary) statistics: - """ - for key, value in statistics.items(): - if isinstance(value, np.ndarray): - statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) - - self.statistics = statistics - for model in self.models: - model.statistics = statistics - - def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation:torch.Tensor): - pred_rewards = self.reward_network(observation, action, next_observation) - return pred_rewards - - def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) - means = [] - norm_means = [] - norm_vars = [] - # Iterate over the neural networks and get the predictions - for model in self.models: - # Predict delta - mean, n_mean, n_var = model.forward(observation, actions) - means.append(mean) - norm_means.append(n_mean) - norm_vars.append(n_var) - # Normalized - predictions_means = torch.stack(means) - predictions_norm_means = torch.stack(norm_means) - predictions_vars = torch.stack(norm_vars) - # Get rid of the nans - not_nans = [] - for i in range(self.num_models): - if not torch.any(torch.isnan(predictions_means[i])): - not_nans.append(i) - if len(not_nans) == 0: - logging.info("Predicting all Nans") - sys.exit() - # Random Take next state. - rand_ind = random.randint(0, len(not_nans) - 1) - prediction = predictions_means[not_nans[rand_ind]] - # next = current + delta - prediction += observation - all_predictions = torch.stack(means) - for j in range(all_predictions.shape[0]): - all_predictions[j] += observation - return prediction, all_predictions, predictions_norm_means, predictions_vars - - def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - ) -> None: - - assert len(states.shape) >= 2 - assert len(actions.shape) == 2 - assert ( - states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size - ) - # For each model, train with different data. 
- mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) - - for i in range(self.num_models): - sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_target = sub_next_states - sub_states - - delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) - _, n_mean, n_var = self.models[i].forward(sub_states, sub_actions) - model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() - - self.optimizers[i].zero_grad() - model_loss.backward() - self.optimizers[i].step() - - def train_reward( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - rewards: torch.Tensor, - ) -> None: - assert len(next_states.shape) >= 2 - self.reward_optimizer.zero_grad() - rwd_mean = self.reward_network(states, actions, next_states) - reward_loss = F.mse_loss(rwd_mean, rewards) - reward_loss.backward() - self.reward_optimizer.step() - - diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py b/cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py deleted file mode 100644 index 183fb7eb..00000000 --- a/cares_reinforcement_learning/networks/world_models/ensemble_world_ensemble_sas_reward.py +++ /dev/null @@ -1,246 +0,0 @@ -import logging -import math -import random -import sys -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils -from torch import optim - -from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - Probabilistic_Dynamics, -) -from cares_reinforcement_learning.networks.world_models.probabilistic_sas_reward import ( - Probabilistic_SAS_Reward, -) -# from cares_reinforcement_learning.networks.world_models.simple_sas_done import ( -# SASDone, -# ) -from cares_reinforcement_learning.util.helpers import normalize_observation_delta - - -class EnsembleWorldEnsembleSASReward: - """ - This class consist of an ensemble of all components for critic update. - Q_label = REWARD + gamma * (1 - DONES) * Q(NEXT_STATES). - - """ - - def __init__( - self, - observation_size: int, - num_actions: int, - num_world_models: int, - num_reward_models: int, - lr: float, - device: str, - hidden_size: int = 128, - ): - self.num_reward_models = num_reward_models - self.num_world_models = num_world_models - - self.observation_size = observation_size - self.num_actions = num_actions - self.device = device - - self.world_models = [Probabilistic_Dynamics(observation_size=observation_size, num_actions=num_actions, - hidden_size=hidden_size) for _ in range(self.num_world_models)] - self.reward_models = [Probabilistic_SAS_Reward(observation_size=observation_size, num_actions=num_actions, - hidden_size=hidden_size) for _ in range(self.num_reward_models)] - self.world_optimizers = [optim.Adam(self.world_models[i].parameters(), lr=lr) for i in - range(self.num_world_models)] - self.reward_optimizers = [optim.Adam(self.reward_models[i].parameters(), lr=lr) for i in - range(self.num_reward_models)] - - # Bring all reward prediction and dynamic rediction networks to device. 
- for reward_model in self.reward_models: - reward_model.to(self.device) - for world_model in self.world_models: - world_model.to(self.device) - - # self.done_model = SASDone(observation_size=observation_size, num_actions=num_actions, - # hidden_size=hidden_size) - # self.done_optimizers = optim.Adam(self.done_model.parameters(), lr=lr) - # self.done_model.to(self.device) - self.statistics = {} - - def set_statistics(self, statistics: dict) -> None: - """ - Update all statistics for normalization for all world models and the - ensemble itself. - - :param (Dictionary) statistics: - """ - for key, value in statistics.items(): - if isinstance(value, np.ndarray): - statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) - self.statistics = statistics - for model in self.world_models: - model.statistics = statistics - - def pred_multiple_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor): - """ - predict reward based on current observation and action and next state - """ - assert len(next_observation.shape) == 3 - pred_reward_means = [] - pred_reward_vars = [] - # 5 - for j in range(next_observation.shape[0]): - next_obs = next_observation[j] - # 5 - for i in range(self.num_reward_models): - pred_reward, reward_var = self.reward_models[i].forward(observation, action, next_obs) - pred_reward_means.append(pred_reward) - pred_reward_vars.append(reward_var) - pred_reward_means = torch.stack(pred_reward_means) - pred_reward_vars = torch.stack(pred_reward_vars) - return pred_reward_means, pred_reward_vars - - def pred_rewards(self, observation: torch.Tensor, - action: torch.Tensor, next_observation: torch.Tensor): - """ - predict reward based on current observation and action and next state - """ - pred_reward_means = [] - pred_reward_vars = [] - for i in range(self.num_reward_models): - pred_reward, reward_var = self.reward_models[i].forward(observation, action, next_observation) - pred_reward_means.append(pred_reward) - pred_reward_vars.append(reward_var) - pred_reward_means = torch.stack(pred_reward_means) - pred_reward_vars = torch.stack(pred_reward_vars) - pred_rewards = torch.mean(pred_reward_means, dim=0) - - return pred_rewards, pred_reward_means, pred_reward_vars - - def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Predict the next state based on the current state and action. - - The output is - Args: - observation: - actions: - - Returns: - prediction: Single prediction, probably mean. - all_predictions: all means from different model. - predictions_norm_means: normalized means. - predictions_vars: normalized vars. - """ - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) - means = [] - norm_means = [] - norm_vars = [] - # Iterate over the neural networks and get the predictions - for model in self.world_models: - # Predict delta - mean, n_mean, n_var = model.forward(observation, actions) - means.append(mean) - norm_means.append(n_mean) - norm_vars.append(n_var) - # Normalized - predictions_means = torch.stack(means) - predictions_norm_means = torch.stack(norm_means) - predictions_vars = torch.stack(norm_vars) - # Get rid of the nans - not_nans = [] - for i in range(self.num_world_models): - if not torch.any(torch.isnan(predictions_means[i])): - not_nans.append(i) - if len(not_nans) == 0: - logging.info("Predicting all Nans") - sys.exit() - # Random Take next state. 
- rand_ind = random.randint(0, len(not_nans) - 1) - prediction = predictions_means[not_nans[rand_ind]] - # next = current + delta - prediction += observation - all_predictions = torch.stack(means) - for j in range(all_predictions.shape[0]): - all_predictions[j] += observation - - return prediction, all_predictions, predictions_norm_means, predictions_vars - - def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - ) -> None: - """ - Train the world with S, A, SN. Different sub-batch. - - Args: - states: - actions: - next_states: - """ - assert len(states.shape) >= 2 - assert len(actions.shape) == 2 - assert ( - states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size - ) - # For each model, train with different data. - mini_batch_size = int(math.floor(states.shape[0] / self.num_world_models)) - - for i in range(self.num_world_models): - sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_target = sub_next_states - sub_states - delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) - _, n_mean, n_var = self.world_models[i].forward(sub_states, sub_actions) - model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() - self.world_optimizers[i].zero_grad() - model_loss.backward() - self.world_optimizers[i].step() - - def train_reward( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - rewards: torch.Tensor, - ) -> None: - """ - Train the reward with S, A, SN to eliminate difference between them. - - Args: - states: - actions: - next_states: - rewards: - """ - mini_batch_size = int(math.floor(states.shape[0] / self.num_reward_models)) - for i in range(self.num_reward_models): - sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_rewards = rewards[i * mini_batch_size: (i + 1) * mini_batch_size] - self.reward_optimizers[i].zero_grad() - rwd_mean, rwd_var = self.reward_models[i].forward(sub_states, sub_actions, sub_next_states) - # reward_loss = F.mse_loss(rwd_mean, sub_rewards) - reward_loss = F.gaussian_nll_loss(input=rwd_mean, target=sub_rewards, var=rwd_var).mean() - reward_loss.backward() - self.reward_optimizers[i].step() - - # def train_done( - # self, - # states: torch.Tensor, - # actions: torch.Tensor, - # dones: torch.Tensor, - # ) -> None: - # self.reward_optimizer.zero_grad() - # prob_dones = self.reward_network.forward(states, actions) - # reward_loss = F.binary_cross_entropy(prob_dones, dones) - # reward_loss.backward() - # self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py b/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py deleted file mode 100644 index f7e90a36..00000000 --- a/cares_reinforcement_learning/networks/world_models/ensmeble_sa_world.py +++ /dev/null @@ -1,165 +0,0 @@ -import logging -import math -import random -import sys - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils -from torch import optim - -from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - Probabilistic_Dynamics, -) -from 
cares_reinforcement_learning.networks.world_models.simple_sa_reward import ( - Simple_SA_Reward, -) -from cares_reinforcement_learning.util.helpers import normalize_observation_delta - - -class EnsembleWorldAndOneSAReward: - """ - Specifications: - - """ - def __init__( - self, - observation_size: int, - num_actions: int, - num_models: int, - lr: float, - device: str, - hidden_size: int = 128, - ): - self.num_models = num_models - self.observation_size = observation_size - self.num_actions = num_actions - - self.reward_network = Simple_SA_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) - - self.models = [ - Probabilistic_Dynamics( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - for _ in range(self.num_models) - ] - - self.optimizers = [optim.Adam(self.models[i].parameters(), lr=lr) for i in range(self.num_models)] - - self.statistics = {} - - # Bring all reward prediction and dynamic rediction networks to device. - self.device = device - self.reward_network.to(self.device) - for model in self.models: - model.to(device) - - def set_statistics(self, statistics: dict) -> None: - """ - Update all statistics for normalization for all world models and the - ensemble itself. - - :param (Dictionary) statistics: - """ - for key, value in statistics.items(): - if isinstance(value, np.ndarray): - statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) - - self.statistics = statistics - for model in self.models: - model.statistics = statistics - - def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor): - pred_rewards = self.reward_network(observation, action) - return pred_rewards - - def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) - means = [] - norm_means = [] - norm_vars = [] - # Iterate over the neural networks and get the predictions - for model in self.models: - # Predict delta - mean, n_mean, n_var = model.forward(observation, actions) - means.append(mean) - norm_means.append(n_mean) - norm_vars.append(n_var) - # Normalized - predictions_means = torch.stack(means) - predictions_norm_means = torch.stack(norm_means) - predictions_vars = torch.stack(norm_vars) - # Get rid of the nans - not_nans = [] - for i in range(self.num_models): - if not torch.any(torch.isnan(predictions_means[i])): - not_nans.append(i) - if len(not_nans) == 0: - logging.info("Predicting all Nans") - sys.exit() - # Random Take next state. - rand_ind = random.randint(0, len(not_nans) - 1) - prediction = predictions_means[not_nans[rand_ind]] - # next = current + delta - prediction += observation - all_predictions = torch.stack(means) - for j in range(all_predictions.shape[0]): - all_predictions[j] += observation - return prediction, all_predictions, predictions_norm_means, predictions_vars - - def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - ) -> None: - - assert len(states.shape) >= 2 - assert len(actions.shape) == 2 - assert ( - states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size - ) - # For each model, train with different data. 
- mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) - - for i in range(self.num_models): - sub_states = states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_actions = actions[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_next_states = next_states[i * mini_batch_size: (i + 1) * mini_batch_size] - sub_target = sub_next_states - sub_states - - delta_targets_normalized = normalize_observation_delta(sub_target, self.statistics) - _, n_mean, n_var = self.models[i].forward(sub_states, sub_actions) - model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() - - self.optimizers[i].zero_grad() - model_loss.backward() - self.optimizers[i].step() - - def train_reward( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - ) -> None: - self.reward_optimizer.zero_grad() - rwd_mean = self.reward_network.forward(states, actions) - reward_loss = F.mse_loss(rwd_mean, rewards) - reward_loss.backward() - self.reward_optimizer.step() - - diff --git a/cares_reinforcement_learning/networks/world_models/simple/__init__.py b/cares_reinforcement_learning/networks/world_models/simple/__init__.py new file mode 100644 index 00000000..96f070c5 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple/__init__.py @@ -0,0 +1,5 @@ +from .simple_ns_reward import Simple_NS_Reward +from .simple_sas_reward import Simple_SAS_Reward +from .probabilistic_ns_reward import Probabilistic_NS_Reward +from .probabilistic_sas_reward import Probabilistic_SAS_Reward +from .probabilistic_dynamic import Probabilistic_Dynamics diff --git a/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py similarity index 60% rename from cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py rename to cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py index ad0262fb..682423a2 100644 --- a/cares_reinforcement_learning/networks/world_models/probabilistic_dynamics.py +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py @@ -1,9 +1,8 @@ import torch -import torch.nn.functional as F import torch.utils from torch import nn -import cares_reinforcement_learning.util.helpers as hlp +from cares_reinforcement_learning.util import weight_init_pnn, MLP class Probabilistic_Dynamics(nn.Module): @@ -22,23 +21,25 @@ class Probabilistic_Dynamics(nn.Module): :param (int) hidden_size -- size of neurons in hidden layers. 
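+    In this revision the separate hidden/mean/log-variance layers are folded
+    into a single MLP with 2 * observation_size outputs: one half is the
+    normalized next-state delta mean, the other a log-variance that is
+    tanh-squashed and exponentiated into a variance. hidden_size is now a
+    list of layer widths, and forward returns only the normalized mean and
+    variance (denormalization is left to the caller). A rough sketch of the
+    training step used by the ensemble classes (F = torch.nn.functional,
+    delta_norm = the normalized next-state delta; names assumed):
+
+        n_mean, n_var = model(norm_obs, actions)
+        loss = F.gaussian_nll_loss(input=n_mean, target=delta_norm, var=n_var).mean()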
""" - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + def __init__(self, observation_size: int, num_actions: int, hidden_size: list): + print("Create a Prob Dynamics") super().__init__() self.observation_size = observation_size self.num_actions = num_actions - self.layer1 = nn.Linear(observation_size + num_actions, hidden_size) - self.layer2 = nn.Linear(hidden_size, hidden_size) - self.mean_layer = nn.Linear(hidden_size, observation_size) - self.logvar_layer = nn.Linear(hidden_size, observation_size) + self.model = MLP(input_size=observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=2 * observation_size) - self.apply(hlp.weight_init) + self.add_module('mlp', self.model) + + self.model.apply(weight_init_pnn) self.statistics = {} def forward( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward the inputs throught the network. @@ -52,24 +53,16 @@ def forward( :return (Tensors) normalized_var -- normalized delta of var for uncertainty estimation. """ - + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) # Always normalized obs - normalized_obs = hlp.normalize_observation(observation, self.statistics) - - x = torch.cat((normalized_obs, actions), dim=1) - x = self.layer1(x) - x = F.relu(x) - x = self.layer2(x) - x = F.relu(x) - - normalized_mean = self.mean_layer(x) - logvar = self.logvar_layer(x) - + x = torch.cat((observation, actions), dim=1) + pred = self.model(x) + logvar = pred[:, :self.observation_size] + normalized_mean = pred[:, self.observation_size:] logvar = torch.tanh(logvar) normalized_var = torch.exp(logvar) - # Always denormalized delta - mean_deltas = hlp.denormalize_observation_delta( - normalized_mean, self.statistics - ) - return mean_deltas, normalized_mean, normalized_var + return normalized_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py new file mode 100644 index 00000000..f9dbe781 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py @@ -0,0 +1,36 @@ +import torch +from torch import nn, Tensor +import torch.nn.functional as F +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init + + +class Probabilistic_NS_Reward(nn.Module): + def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize:bool): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). + """ + super().__init__() + print("Create a Prob NS Rewrad") + self.normalize = normalize + self.observation_size = observation_size + self.num_actions = num_actions + self.model = MLP(input_size=observation_size, hidden_sizes=hidden_size, output_size=2) + self.add_module('mlp', self.model) + self.model.apply(weight_init) + def forward( + self, + next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. 
+ """ + pred = self.model(next_observation) + var_mean = pred[:, 1].unsqueeze(dim=1) + rwd_mean = pred[:, 0].unsqueeze(dim=1) + logvar = torch.tanh(var_mean) + normalized_var = torch.exp(logvar) + if self.normalize: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py similarity index 65% rename from cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py rename to cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py index 3d84f33a..af6fa196 100644 --- a/cares_reinforcement_learning/networks/world_models/probabilistic_sas_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py @@ -1,11 +1,11 @@ import torch from torch import nn, Tensor import torch.nn.functional as F -from cares_reinforcement_learning.util.helpers import weight_init +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init class Probabilistic_SAS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize: bool): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. It takes current states (s) @@ -16,13 +16,17 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): :param (int) hidden_size -- size of neurons in hidden layers. """ super().__init__() + print("Create a Prob SAS Rewrad") + self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.linear1 = nn.Linear(2 * observation_size + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) - self.linear4 = nn.Linear(hidden_size, 1) - self.apply(weight_init) + + self.model = MLP(input_size=2 * observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=2) + + self.add_module('mlp', self.model) + self.model.apply(weight_init) def forward( self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: @@ -36,19 +40,12 @@ def forward( :return (Tensors) x -- predicted rewards. 
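+        The concatenated (observation, action, next_observation) vector is
+        fed through the shared MLP; the first output column is the reward
+        mean and the second a log-variance (tanh-squashed, then exponentiated
+        into a variance). With normalize enabled the mean is also passed
+        through a sigmoid.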
""" - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) x = torch.cat((observation, actions, next_observation), dim=1) - x = self.linear1(x) - x = F.relu(x) - x = self.linear2(x) - x = F.relu(x) - rwd_mean = self.linear3(x) - var_mean = self.linear4(x) + pred = self.model(x) + rwd_mean = pred[:, 0].unsqueeze(dim=1) + var_mean = pred[:, 1].unsqueeze(dim=1) logvar = torch.tanh(var_mean) normalized_var = torch.exp(logvar) - # if normalized: - # rwd_mean = F.sigmoid(rwd_mean) + if self.normalize: + rwd_mean = F.sigmoid(rwd_mean) return rwd_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/simple_sa_reward.py b/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py similarity index 62% rename from cares_reinforcement_learning/networks/world_models/simple_sa_reward.py rename to cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py index 0e28bed7..776ca2c7 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_sa_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py @@ -1,12 +1,11 @@ import torch -import torch.nn.functional as F from torch import nn - -import cares_reinforcement_learning.util.helpers as hlp +import torch.nn.functional as F +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init -class Simple_SA_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): +class Simple_NS_Reward(nn.Module): + def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize:bool): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. It takes current states (s) @@ -17,17 +16,16 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): :param (int) hidden_size -- size of neurons in hidden layers. """ super().__init__() + print("Create a Simple NS Rewrad") + self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) - - self.apply(hlp.weight_init) + self.model = MLP(input_size=observation_size, hidden_sizes=hidden_size, output_size=1) + self.add_module('mlp', self.model) + self.model.apply(weight_init) def forward( - self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False - ) -> torch.Tensor: + self, observation: torch.Tensor) -> torch.Tensor: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. @@ -38,12 +36,7 @@ def forward( :return (Tensors) x -- predicted rewards. 
""" - x = torch.cat((observation, actions), dim=1) - x = self.linear1(x) - x = F.relu(x) - x = self.linear2(x) - x = F.relu(x) - rwd_mean = self.linear3(x) - if normalized: + rwd_mean = self.model(observation) + if self.normalize: rwd_mean = F.sigmoid(rwd_mean) return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/simple_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py similarity index 72% rename from cares_reinforcement_learning/networks/world_models/simple_sas_reward.py rename to cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py index 3e01cc31..c3348c1e 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_sas_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py @@ -1,11 +1,11 @@ import torch from torch import nn import torch.nn.functional as F -from cares_reinforcement_learning.util.helpers import weight_init +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init class Simple_SAS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize: bool): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. It takes current states (s) @@ -16,16 +16,16 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): :param (int) hidden_size -- size of neurons in hidden layers. """ super().__init__() + print("Create a Simple SAS Rewrad") + self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.linear1 = nn.Linear(2 * observation_size + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) - self.apply(weight_init) + self.model = MLP(input_size=2 * observation_size + num_actions, hidden_sizes=hidden_size, output_size=1) + self.add_module('mlp', self.model) + self.model.apply(weight_init) def forward( - self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor, normalized: bool = False - ) -> torch.Tensor: + self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor) -> torch.Tensor: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. @@ -41,11 +41,7 @@ def forward( == self.observation_size + self.num_actions ) x = torch.cat((observation, actions, next_observation), dim=1) - x = self.linear1(x) - x = F.relu(x) - x = self.linear2(x) - x = F.relu(x) - rwd_mean = self.linear3(x) - if normalized: + rwd_mean = self.model(x) + if self.normalize: rwd_mean = F.sigmoid(rwd_mean) return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/simple_ns_reward.py b/cares_reinforcement_learning/networks/world_models/simple_ns_reward.py deleted file mode 100644 index 67491a87..00000000 --- a/cares_reinforcement_learning/networks/world_models/simple_ns_reward.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch -from torch import nn -import torch.nn.functional as F -from cares_reinforcement_learning.util.helpers import weight_init - - -class Simple_NS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): - """ - Note, This reward function is limited to 0 ~ 1 for dm_control. - A reward model with fully connected layers. 
It takes current states (s) - and current actions (a), and predict rewards (r). - - :param (int) observation_size -- dimension of states - :param (int) num_actions -- dimension of actions - :param (int) hidden_size -- size of neurons in hidden layers. - """ - super().__init__() - self.observation_size = observation_size - self.num_actions = num_actions - self.linear1 = nn.Linear(observation_size, hidden_size) - # self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) - self.apply(weight_init) - - def forward( - self, observation: torch.Tensor, normalized: bool = False - ) -> torch.Tensor: - """ - Forward the inputs throught the network. - Note: For DMCS environment, the reward is from 0~1. - - :param (Tensors) obs -- dimension of states - :param (Tensors) actions -- dimension of actions - :param (Bool) normalized -- whether normalized reward to 0~1 - - :return (Tensors) x -- predicted rewards. - """ - # assert ( - # observation.shape[1] + actions.shape[1] - # == self.observation_size + self.num_actions - # ) - # x = torch.cat((observation, actions), dim=1) - x = observation - x = self.linear1(x) - x = F.relu(x) - x = self.linear2(x) - x = F.relu(x) - rwd_mean = self.linear3(x) - if normalized: - rwd_mean = F.sigmoid(rwd_mean) - return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/simple_sas_done.py b/cares_reinforcement_learning/networks/world_models/simple_sas_done.py deleted file mode 100644 index d0810e53..00000000 --- a/cares_reinforcement_learning/networks/world_models/simple_sas_done.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -from torch import nn -import torch.nn.functional as F -from cares_reinforcement_learning.util.helpers import weight_init - - -class Simple_SAS_Done(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): - """ - Note, This reward function is limited to 0 ~ 1 for dm_control. - A reward model with fully connected layers. It takes current states (s) - and current actions (a), and predict rewards (r). - - :param (int) observation_size -- dimension of states - :param (int) num_actions -- dimension of actions - :param (int) hidden_size -- size of neurons in hidden layers. - """ - super().__init__() - self.observation_size = observation_size - self.num_actions = num_actions - self.linear1 = nn.Linear(2 * observation_size + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) - self.apply(weight_init) - - def forward( - self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor, normalized: bool = False - ) -> torch.Tensor: - """ - Forward the inputs throught the network. - Note: For DMCS environment, the reward is from 0~1. - - :param (Tensors) obs -- dimension of states - :param (Tensors) actions -- dimension of actions - :param (Bool) normalized -- whether normalized reward to 0~1 - - :return (Tensors) x -- predicted rewards. 
- """ - assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions - ) - x = torch.cat((observation, actions, next_observation), dim=1) - x = self.linear1(x) - x = F.relu(x) - x = self.linear2(x) - x = F.relu(x) - x = self.linear3(x) - prob_x = F.sigmoid(x) - return prob_x diff --git a/cares_reinforcement_learning/networks/world_models/world_model.py b/cares_reinforcement_learning/networks/world_models/world_model.py new file mode 100644 index 00000000..b6ec2744 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/world_model.py @@ -0,0 +1,174 @@ +import logging +import torch +import numpy as np +from cares_reinforcement_learning.networks.world_models.simple import Probabilistic_SAS_Reward, Probabilistic_NS_Reward +from cares_reinforcement_learning.networks.world_models.simple import Simple_SAS_Reward, Simple_NS_Reward +import torch.nn.functional as F +import torch.utils +from torch import optim + + +class World_Model: + """ + World Model + """ + + def __init__( + self, + observation_size: int, + num_actions: int, + l_r: float, + device: str, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = False, + ): + if hidden_size is None: + hidden_size = [128, 128] + self.sas = None + self.prob_rwd = None + self.statistics = {} + self.device = device + self.sas = sas + self.prob_rwd = prob_rwd + self.statistics = {} + if prob_rwd: + if sas: + self.reward_network = Probabilistic_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) + else: + self.reward_network = Probabilistic_NS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) + else: + if sas: + self.reward_network = Simple_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) + else: + self.reward_network = Simple_NS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) + self.reward_network.to(self.device) + self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=l_r) + + def set_statistics(self, statistics: dict) -> None: + """ + Update all statistics for normalization for all world models and the + ensemble itself. + + :param (Dictionary) statistics: + """ + for key, value in statistics.items(): + if isinstance(value, np.ndarray): + statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) + self.statistics = statistics + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + """ + Train the dynamic of world model. + :param states: + :param actions: + :param next_states: + """ + logging.info(" Train world Not Implemented") + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Make a prediction of next state. + :param observation: + :param actions: + :return: Next_state Prediction, Next_state Means, Next_State Variance. + """ + logging.info("Predict Next Not Implemented") + return torch.zeros(observation.shape), torch.zeros(observation.shape), torch.zeros(observation.shape) + + def train_reward( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + rewards: torch.Tensor, + ) -> None: + """ + Train the reward prediction with or without world model dynamics. 
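+        Uses a Gaussian negative log-likelihood loss when prob_rwd is set and
+        a plain MSE loss otherwise; the reward model is conditioned on
+        (state, action, next_state) when sas is set, and on the next state
+        otherwise. A minimal sketch of the intended call pattern (argument
+        values here are illustrative assumptions):
+
+            world = World_Model(obs_dim, act_dim, l_r=1e-3, device="cpu",
+                                sas=True, prob_rwd=True)
+            world.set_statistics(stats)
+            world.train_reward(states, actions, next_states, rewards)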
+ + :param states: + :param actions: + :param next_states: + :param rewards: + """ + self.reward_optimizer.zero_grad() + if self.prob_rwd: + if self.sas: + rwd_mean, rwd_var = self.reward_network(states, actions, next_states) + else: + rwd_mean, rwd_var = self.reward_network(next_states, actions) + reward_loss = F.gaussian_nll_loss(input=rwd_mean, target=rewards, var=rwd_var) + else: + if self.sas: + rwd_mean = self.reward_network(states, actions, next_states) + else: + rwd_mean = self.reward_network(next_states, actions) + reward_loss = F.mse_loss(rwd_mean, rewards) + reward_loss.backward() + self.reward_optimizer.step() + + def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict reward based on SAS + :param observation: + :param action: + :param next_observation: + :return: Predicted rewards, Means of rewards, Variances of rewards + """ + + if self.prob_rwd: + if self.sas: + pred_rewards, rwd_var = self.reward_network(observation, action, next_observation) + else: + pred_rewards, rwd_var = self.reward_network(next_observation) + return pred_rewards, rwd_var + else: + if self.sas: + pred_rewards = self.reward_network(observation, action, next_observation) + else: + pred_rewards = self.reward_network(next_observation) + return pred_rewards, None + + def estimate_uncertainty( + self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool, + ) -> tuple[float, float, torch.Tensor]: + """ + Estimate next state uncertainty and reward uncertainty. + + :param observation: + :param actions: + :return: Dynamic Uncertainty, Reward Uncertainty + """ + logging.info("Estimating Uncertainty Not Implemented") + return 0.0, 0.0, None + + def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor, ): + logging.info("Train Together Not Implemented") diff --git a/cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py deleted file mode 100644 index fa8f1339..00000000 --- a/cares_reinforcement_learning/networks/world_models/z_ensemble_integrated.py +++ /dev/null @@ -1,307 +0,0 @@ -import logging -import math -import random -import sys - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils -from torch import optim - -from cares_reinforcement_learning.networks.world_models.probabilistic_dynamics import ( - Probabilistic_Dynamics, -) -from cares_reinforcement_learning.networks.world_models.simple_ns_reward import ( - Simple_NS_Reward, -) - -from cares_reinforcement_learning.util.helpers import normalize_observation_delta - - -class IntegratedWorldModel: - """ - A integrated world model aims to train the reward prediciton and next state - prediciton together. - - :param (int) observation_size -- dimension of states - :param (int) num_actions -- dimension of actions - :param (int) hidden_size -- size of neurons in hidden layers. 
- """ - - def __init__( - self, - observation_size: int, - num_actions: int, - hidden_size: int, - lr: float = 0.001, - ): - self.dyna_network = Probabilistic_Dynamics( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - self.reward_network = Simple_NS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - - self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) - - self.dyna_optimizer = optim.Adam(self.dyna_network.parameters(), lr=lr) - - self.all_optimizer = optim.Adam( - list(self.reward_network.parameters()) - + list(self.dyna_network.parameters()), - lr=lr, - ) - - self.statistics = {} - - def train_dynamics( - self, states: torch.Tensor, actions: torch.Tensor, next_states: torch.Tensor - ) -> None: - """ - Train the dynamics (next state prediciton) alone. Predicting the delta - rather than the next state. - - :param (Tensor) states -- states input - :param (Tensor) actions -- actions input - :param (Tensor) next_states -- target label. - """ - target = next_states - states - delta_targets_normalized = hlp.normalize_observation_delta( - target, self.statistics - ) - - _, n_mean, n_var = self.dyna_network.forward(states, actions) - - model_loss = F.gaussian_nll_loss( - input=n_mean, target=delta_targets_normalized, var=n_var - ).mean() - - self.dyna_optimizer.zero_grad() - model_loss.backward() - self.dyna_optimizer.step() - - def train_overall( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - next_actions: torch.Tensor, - next_rewards: torch.Tensor, - ) -> None: - """ - Do one step preidiciton, train both network together. Add Two loss - functions. - - :param (Tensor) states: - :param (Tensor) actions: - :param (Tensor) next_states: - :param (Tensor) next_actions: - :param (Tensor) next_rewards: - """ - # Get the dynamics training losses first - mean_deltas, normalized_mean, normalized_var = self.dyna_network.forward( - states, actions - ) - - # Always denormalized delta - pred_next_state = mean_deltas + states - target = next_states - states - - delta_targets_normalized = hlp.normalize_observation_delta( - target, self.statistics - ) - - model_loss = F.gaussian_nll_loss( - input=normalized_mean, target=delta_targets_normalized, var=normalized_var - ).mean() - - rwd_mean = self.reward_network.forward(pred_next_state, next_actions) - # rwd_loss = F.gaussian_nll_loss(input=rwd_mean, target=next_rewards, var=rwd_var) - rwd_loss = F.mse_loss(rwd_mean, next_rewards) - all_loss = rwd_loss + model_loss.mean() - - # Update - self.all_optimizer.zero_grad() - all_loss.backward() - self.all_optimizer.step() - - -class EnsembleWorldReward: - """ - Ensemble the integrated dynamic reward models. It works like a group of - experts. The predicted results can be used to estimate the uncertainty. - - :param (int) observation_size -- dimension of states - :param (int) num_actions -- dimension of actions - :param (int) num_models -- number of world models in this ensemble. - :param (int) hidden_size -- size of neurons in hidden layers. 
- """ - - def __init__( - self, - observation_size: int, - num_actions: int, - num_models: int, - lr: float, - device: torch.device, - hidden_size: int = 128, - ): - self.num_models = num_models - self.observation_size = observation_size - self.num_actions = num_actions - - self.models = [ - IntegratedWorldModel( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - lr=lr, - ) - for _ in range(self.num_models) - ] - self.statistics = {} - - # Bring all reward prediction and dynamic rediction networks to device. - self.device = device - for model in self.models: - model.dyna_network.to(device) - model.reward_network.to(device) - - def set_statistics(self, statistics: dict) -> None: - """ - Update all statistics for normalization for all world models and the - ensemble itself. - - :param (Dictionary) statistics: - """ - for key, value in statistics.items(): - if isinstance(value, np.ndarray): - statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) - - self.statistics = statistics - for model in self.models: - model.statistics = statistics - model.dyna_network.statistics = statistics - - def pred_rewards( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> torch.Tensor: - """ - Make a prediciton of rewards based on current state and actions. Take - the mean of rewards as final for now. - - :param (Tensors) obs -- dimension of states - :param (Tensors) actions -- dimension of actions - - :return (Tensors) reward -- predicted mean rewards. - :return (List) rewards -- A list of predicted rewards. For STEVE use. - """ - rewards = [] - for model in self.models: - pred_rewards = model.reward_network.forward(observation, actions) - rewards.append(pred_rewards) - - # Use average - rewards = torch.stack(rewards) - - rand_ind = random.randint(0, rewards.shape[0]) - 1 - reward = rewards[rand_ind] - - # reward = torch.min(rewards, dim=0).values # Pessimetic - return reward, rewards - - def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Predict the next state based on current state and action, using an - ensemble of world models. The world model is probablisitic. It is - trained with Gaussian NLL loss. - - :param (Tensors) obs -- dimension of states - :param (Tensors) actions -- dimension of actions - - :return (Tensors) random picked next state predicitons - :return (Tensors) all next state predicitons - :return (Tensors) all normalized delta' means - :return (Tensors) all normalized delta' vars - """ - means = [] - norm_means = [] - norm_vars = [] - - # Iterate over the neural networks and get the predictions - for model in self.models: - # Predict delta - mean, n_mean, n_var = model.dyna_network.forward(observation, actions) - means.append(mean) - norm_means.append(n_mean) - norm_vars.append(n_var) - - # Normalized - predictions_means = torch.stack(means) - predictions_norm_means = torch.stack(norm_means) - predictions_vars = torch.stack(norm_vars) - - # Get rid of the nans - not_nans = [] - for i in range(self.num_models): - if not torch.any(torch.isnan(predictions_means[i])): - not_nans.append(i) - if len(not_nans) == 0: - logging.info("Predicting all Nans") - sys.exit() - - # Random Take next state. 
- rand_ind = random.randint(0, len(not_nans) - 1) - prediction = predictions_means[not_nans[rand_ind]] - - # next = current + delta - prediction += observation - all_predictions = torch.stack(means) - for j in range(all_predictions.shape[0]): - all_predictions[j] += observation - - return prediction, all_predictions, predictions_norm_means, predictions_vars - - def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - next_actions: torch.Tensor, - next_rewards: torch.Tensor, - ) -> None: - # pylint: disable-next=unused-argument - """ - This function decides how to train both reward prediciton and dynamic - prediction. - - :param (Tensors) input states: - :param (Tensors) input actions: - :param (Tensors) input rewards: - :param (Tensors) input next_states: - :param (Tensors) input next_actions: - :param (Tensors) input next_rewards: - """ - # For each model, train with different data. - mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) - for i in range(self.num_models): - self.models[i].train_dynamics( - states[i * mini_batch_size : (i + 1) * mini_batch_size], - actions[i * mini_batch_size : (i + 1) * mini_batch_size], - next_states[i * mini_batch_size : (i + 1) * mini_batch_size], - ) - self.models[i].train_overall( - states[i * mini_batch_size : (i + 1) * mini_batch_size], - actions[i * mini_batch_size : (i + 1) * mini_batch_size], - next_states[i * mini_batch_size : (i + 1) * mini_batch_size], - next_actions[i * mini_batch_size : (i + 1) * mini_batch_size], - next_rewards[i * mini_batch_size : (i + 1) * mini_batch_size], - ) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index cca6f9cd..8048256e 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -1,6 +1,12 @@ -from typing import List, Optional +from typing import Any + +import pydantic from pydantic import BaseModel, Field from torch import nn +from cares_reinforcement_learning.encoders.configurations import ( + BurgessConfig, + VanillaAEConfig, +) # pylint disbale-next=unused-import @@ -112,52 +118,6 @@ class AlgorithmConfig(SubscriptableClass): image_observation: int = 0 -################################### -# DQN Algorithms # -################################### - - -class DQNConfig(AlgorithmConfig): - algorithm: str = Field("DQN", Literal=True) - lr: float = 1e-3 - gamma: float = 0.99 - - exploration_min: float = 1e-3 - exploration_decay: float = 0.95 - - network_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) - - -class DoubleDQNConfig(DQNConfig): - algorithm: str = Field("DoubleDQN", Literal=True) - lr: float = 1e-3 - gamma: float = 0.99 - tau: float = 0.005 - - exploration_min: float = 1e-3 - exploration_decay: float = 0.95 - - network_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) - - -class DuelingDQNConfig(AlgorithmConfig): - algorithm: str = Field("DuelingDQN", Literal=True) - lr: float = 1e-3 - gamma: float = 0.99 - - exploration_min: float = 1e-3 - exploration_decay: float = 0.95 - - feature_layer_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) - value_stream_config: MLPConfig = MLPConfig(hidden_sizes=[512]) - advantage_stream_config: MLPConfig = MLPConfig(hidden_sizes=[512]) - - -################################### -# PPO Algorithms # -################################### - - class PPOConfig(AlgorithmConfig): algorithm: str = Field("PPO", Literal=True) actor_lr: float 
= 1e-4 @@ -174,7 +134,6 @@ class PPOConfig(AlgorithmConfig): ) critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) - ################################### # SAC Algorithms # ################################### @@ -193,28 +152,12 @@ class SACConfig(AlgorithmConfig): critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) -class DynaSAC_SASConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_SAS", Literal=True) - actor_lr: float = 3e-4 - critic_lr: float = 3e-4 - alpha_lr: float = 3e-4 - gamma: float = 0.99 - tau: float = 0.005 - reward_scale: float = 1.0 - log_std_bounds: list[float] = [-20, 2] - policy_update_freq: int = 1 - target_update_freq: int = 1 - actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) - critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) - - num_models: int = 5 - world_model_lr: float = 0.001 - horizon: int = 3 - num_samples: int = 10 - - class DynaSAC_NSConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_NS", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = 1, + G_model: float = 1, + actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -227,14 +170,23 @@ class DynaSAC_NSConfig(AlgorithmConfig): actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + max_steps_exploration: int = 256 num_models: int = 5 world_model_lr: float = 0.001 horizon: int = 3 num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + +class DynaSAC_BoundedConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Bounded", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = 1, + G_model: float = 1, -class DynaSAC_BoundedNSConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_BoundedNS", Literal=True) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -247,16 +199,22 @@ class DynaSAC_BoundedNSConfig(AlgorithmConfig): actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + max_steps_exploration: int = 256 num_models: int = 5 world_model_lr: float = 0.001 horizon: int = 3 num_samples: int = 10 - + sas: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False threshold: float = 0.1 + exploration_sample: int = 5 class STEVE_MEANConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_NS", Literal=True) + algorithm: str = Field("STEVE", Literal=True) + type: str = Field("mbrl", Literal=True) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -273,10 +231,15 @@ class STEVE_MEANConfig(AlgorithmConfig): world_model_lr: float = 0.001 horizon: int = 3 num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False class DynaSAC_SAS_Immersive_WeightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_IWNS", Literal=True) + type: str = Field("mbrl", Literal=True) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -297,9 +260,14 @@ class DynaSAC_SAS_Immersive_WeightConfig(AlgorithmConfig): threshold: float = 0.1 reweight_actor: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + class DynaSAC_BIVReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_BIVNS", Literal=True) + type: str = Field("mbrl", Literal=True) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -320,9 +288,14 @@ 
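The reweighting variants defined in this file (immersive-weight, BIV, SUNRISE, UWAC) all expose a `threshold` and a `reweight_actor` flag. As a rough, hedged illustration of what such settings typically control (not necessarily the exact scheme these agents implement), per-sample weights for model-generated transitions can be derived from ensemble disagreement, for example inverse-variance weighting with a floor at `threshold`:

# Hedged sketch only: `ensemble_preds` is a hypothetical tensor of shape
# [num_models, batch, obs_dim]; the repository's actual reweighting may differ.
import torch

def inverse_variance_weights(ensemble_preds: torch.Tensor, threshold: float = 0.1) -> torch.Tensor:
    var = ensemble_preds.var(dim=0).mean(dim=-1, keepdim=True)  # per-sample epistemic spread
    weights = 1.0 / (var + 1e-6)                                # inverse-variance weights
    weights = weights / weights.max()                           # normalise to (0, 1]
    return torch.clamp(weights, min=threshold)                  # floor low-confidence samples
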
class DynaSAC_BIVReweightConfig(AlgorithmConfig): threshold: float = 0.1 reweight_actor: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + class DynaSAC_SUNRISEReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_SUNRISENS", Literal=True) + type: str = Field("mbrl", Literal=True) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -343,9 +316,14 @@ class DynaSAC_SUNRISEReweightConfig(AlgorithmConfig): threshold: float = 0.1 reweight_actor: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + class DynaSAC_UWACReweightConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_UWACNS", Literal=True) + type: str = Field("mbrl", Literal=True) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -365,3 +343,583 @@ class DynaSAC_UWACReweightConfig(AlgorithmConfig): threshold: float = 0.1 reweight_actor: bool = False + + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + +############ Useless to me ########### +class DQNConfig(AlgorithmConfig): + algorithm: str = Field("DQN", Literal=True) + lr: float = 1e-3 + gamma: float = 0.99 + + exploration_min: float = 1e-3 + exploration_decay: float = 0.95 + + network_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) + + +class DoubleDQNConfig(DQNConfig): + algorithm: str = Field("DoubleDQN", Literal=True) + lr: float = 1e-3 + gamma: float = 0.99 + tau: float = 0.005 + + exploration_min: float = 1e-3 + exploration_decay: float = 0.95 + + network_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) + + +class DuelingDQNConfig(AlgorithmConfig): + algorithm: str = Field("DuelingDQN", Literal=True) + lr: float = 1e-3 + gamma: float = 0.99 + + exploration_min: float = 1e-3 + exploration_decay: float = 0.95 + + feature_layer_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) + value_stream_config: MLPConfig = MLPConfig(hidden_sizes=[512]) + advantage_stream_config: MLPConfig = MLPConfig(hidden_sizes=[512]) + + +class SACDConfig(AlgorithmConfig): + algorithm: str = Field("SACD", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + + batch_size = 64 + + target_entropy_multiplier = 0.98 + + max_steps_exploration = 20000 + number_steps_per_train_policy = 4 + + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) + + +class SACAEConfig(SACConfig): + algorithm: str = Field("SACAE", Literal=True) + + image_observation: int = 1 + batch_size: int = 128 + + actor_lr: float = 1e-3 + critic_lr: float = 1e-3 + alpha_lr: float = 1e-4 + + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 2 + target_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) + + encoder_tau: float = 0.05 + decoder_update_freq: int = 1 + + vector_observation: int = 0 + + autoencoder_config: VanillaAEConfig = VanillaAEConfig( + latent_dim=50, + num_layers=4, + num_filters=32, + kernel_size=3, + latent_lambda=1e-6, + encoder_optim_kwargs={"lr": 1e-3}, + decoder_optim_kwargs={"lr": 1e-3, "weight_decay": 1e-7}, + ) + + +class PERSACConfig(SACConfig): + algorithm: str = Field("PERSAC", Literal=True) + + 
actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.6 + min_priority: float = 1e-6 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class REDQConfig(SACConfig): + algorithm: str = Field("REDQ", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + + gamma: float = 0.99 + tau: float = 0.005 + ensemble_size: int = 10 + num_sample_critics: int = 2 + + G: int = 20 + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class TQCConfig(SACConfig): + algorithm: str = Field("TQC", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + + gamma: float = 0.99 + tau: float = 0.005 + top_quantiles_to_drop: int = 2 + num_quantiles: int = 25 + num_critics: int = 5 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512, 512]) + + +class LAPSACConfig(SACConfig): + algorithm: str = Field("LAPSAC", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + + gamma: float = 0.99 + tau: float = 0.005 + per_alpha: float = 0.6 + reward_scale: float = 1.0 + min_priority: float = 1.0 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class LA3PSACConfig(SACConfig): + algorithm: str = Field("LA3PSAC", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 5.0 + + beta: float = 0.4 + per_alpha: float = 0.4 + min_priority: float = 1.0 + prioritized_fraction: float = 0.5 + + log_std_bounds: list[float] = [-20, 2] + + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class MAPERSACConfig(SACConfig): + algorithm: str = Field("MAPERSAC", Literal=True) + + max_steps_exploration: int = 10000 + + actor_lr: float = 7.3e-4 + critic_lr: float = 7.3e-4 + alpha_lr: float = 7.3e-4 + gamma: float = 0.98 + tau: float = 0.02 + + beta: float = 0.4 + per_alpha: float = 0.7 + min_priority: float = 1e-6 + + G: int = 64 + number_steps_per_train_policy: int = 64 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[400, 300]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[400, 300]) + + +class RDSACConfig(SACConfig): + algorithm: str = Field("RDSAC", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.7 + min_priority: float = 1.0 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + 
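These configuration classes are plain pydantic models, so algorithm hyperparameters can be overridden at construction time or filled from a parsed JSON/YAML dictionary. A minimal usage sketch, assuming the classes above validate as ordinary pydantic models (class and field names are taken from the definitions in this file; the loading code itself is illustrative only):

# Hedged example: construct a config with overrides and read its fields.
config = SACConfig(actor_lr=1e-4, gamma=0.98)
print(config.actor_lr, config.gamma)

# The same pattern applies to the model-based configs, e.g. from a parsed dict:
raw = {"actor_lr": 3e-4, "critic_lr": 3e-4, "horizon": 3, "num_models": 5}
mbrl_config = DynaSAC_NSConfig(**raw)
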
+class CrossQConfig(AlgorithmConfig): + algorithm: str = Field("CrossQ", Literal=True) + actor_lr: float = 1e-3 + critic_lr: float = 1e-3 + alpha_lr: float = 1e-3 + + gamma: float = 0.99 + reward_scale: float = 1.0 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 3 + + actor_config: MLPConfig = MLPConfig( + input_layer="BatchRenorm1d", + linear_layer_args={"bias": False}, + hidden_sizes=[256, 256], + batch_layer="BatchRenorm1d", + batch_layer_args={"momentum": 0.01}, + layer_order=["activation", "batch"], + ) + critic_config: MLPConfig = MLPConfig( + input_layer="BatchRenorm1d", + linear_layer_args={"bias": False}, + hidden_sizes=[2048, 2048], + batch_layer="BatchRenorm1d", + batch_layer_args={"momentum": 0.01}, + layer_order=["activation", "batch"], + ) + + +class DroQConfig(SACConfig): + algorithm: str = Field("DroQ", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + + G: int = 20 + + log_std_bounds: list[float] = [-20, 2] + + policy_update_freq: int = 1 + target_update_freq: int = 1 + + hidden_size_actor: list[int] = [256, 256] + hidden_size_critic: list[int] = [256, 256] + + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], + dropout_layer="Dropout", + dropout_layer_args={"p": 0.005}, + norm_layer="LayerNorm", + layer_order=["dropout", "layernorm", "activation"], + ) + +class DDPGConfig(AlgorithmConfig): + algorithm: str = Field("DDPG", Literal=True) + actor_lr: float = 1e-4 + critic_lr: float = 1e-3 + + gamma: float = 0.99 + tau: float = 0.005 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[1024, 1024], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) + + +class TD3Config(AlgorithmConfig): + algorithm: str = Field("TD3", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + + gamma: float = 0.99 + tau: float = 0.005 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class TD3AEConfig(TD3Config): + algorithm: str = Field("TD3AE", Literal=True) + + image_observation: int = 1 + batch_size: int = 128 + + actor_lr: float = 1e-3 + critic_lr: float = 1e-3 + alpha_lr: float = 1e-4 + + gamma: float = 0.99 + tau: float = 0.005 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[1024, 1024], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) + + encoder_tau: float = 0.05 + decoder_update_freq: int = 1 + + vector_observation: int = 0 + + autoencoder_config: VanillaAEConfig = VanillaAEConfig( + latent_dim=50, + num_layers=4, + num_filters=32, + kernel_size=3, + latent_lambda=1e-6, + encoder_optim_kwargs={"lr": 1e-3}, + decoder_optim_kwargs={"lr": 1e-3, "weight_decay": 1e-7}, + ) + + +class NaSATD3Config(TD3Config): + algorithm: str = Field("NaSATD3", Literal=True) + + image_observation: int = 1 + + actor_lr: float = 1e-4 + critic_lr: float = 1e-3 + epm_lr: float = 1e-4 + + gamma: float = 0.99 + tau: float = 0.005 + ensemble_size: int = 3 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[1024, 1024], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = 
MLPConfig(hidden_sizes=[1024, 1024]) + epm_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) + + intrinsic_on: int = 1 + + vector_observation: int = 0 + + autoencoder_config: VanillaAEConfig = VanillaAEConfig( + latent_dim=200, + num_layers=4, + num_filters=32, + kernel_size=3, + latent_lambda=1e-6, + encoder_optim_kwargs={"lr": 1e-3}, + decoder_optim_kwargs={"lr": 1e-3, "weight_decay": 1e-7}, + ) + + # autoencoder_config: AEConfig] = VAEConfig( + # latent_dim=200, + # num_layers=4, + # num_filters=32, + # kernel_size=3, + # latent_lambda=1e-6, + # encoder_optim_kwargs={"lr": 1e-3}, + # decoder_optim_kwargs={"lr": 1e-3, "weight_decay": 1e-7}, + # ) + + +class PERTD3Config(TD3Config): + algorithm: str = Field("PERTD3", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.6 + min_priority: float = 1e-6 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class LAPTD3Config(TD3Config): + algorithm: str = Field("LAPTD3", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.4 + min_priority: float = 1.0 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class PALTD3Config(TD3Config): + algorithm: str = Field("PALTD3", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.4 + min_priority: float = 1.0 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class LA3PTD3Config(TD3Config): + algorithm: str = Field("LA3PTD3", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.4 + min_priority: float = 1.0 + prioritized_fraction: float = 0.5 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class MAPERTD3Config(TD3Config): + algorithm: str = Field("MAPERTD3", Literal=True) + + max_steps_exploration: int = 10000 + + batch_size: int = 100 + + actor_lr: float = 1e-3 + critic_lr: float = 1e-3 + gamma: float = 0.98 + tau: float = 0.005 + + beta: float = 1.0 + per_alpha: float = 0.7 + min_priority: float = 1e-6 + + G: int = 64 + number_steps_per_train_policy: int = 64 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class RDTD3Config(TD3Config): + algorithm: str = Field("RDTD3", Literal=True) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + + beta: float = 0.4 + per_alpha: float = 0.7 + min_priority: float = 1.0 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], 
output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class CTD4Config(TD3Config): + algorithm: str = Field("CTD4", Literal=True) + + actor_lr: float = 1e-4 + critic_lr: float = 1e-3 + gamma: float = 0.99 + tau: float = 0.005 + ensemble_size: int = 3 + + min_noise: float = 0.0 + noise_decay: float = 0.999999 + noise_scale: float = 0.1 + + policy_update_freq: int = 2 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[256, 256], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + fusion_method: str = "kalman" # kalman, minimum, average diff --git a/cares_reinforcement_learning/util/helpers.py b/cares_reinforcement_learning/util/helpers.py index 7980075e..859edf72 100644 --- a/cares_reinforcement_learning/util/helpers.py +++ b/cares_reinforcement_learning/util/helpers.py @@ -4,6 +4,50 @@ import numpy as np import torch +from torch import nn as nn +import torch.nn.functional as F +import time + +class MLP(nn.Module): + def __init__(self, input_size: int, hidden_sizes: list[int], output_size: int): + super().__init__() + + self.fully_connected_layers = [] + for i, next_size in enumerate(hidden_sizes): + fully_connected_layer = nn.Linear(input_size, next_size) + self.add_module(f"fully_connected_layer_{i}", fully_connected_layer) + self.fully_connected_layers.append(fully_connected_layer) + input_size = next_size + + self.output_layer = nn.Linear(input_size, output_size) + + def forward(self, state): + for fully_connected_layer in self.fully_connected_layers: + state = F.relu(fully_connected_layer(state)) + output = self.output_layer(state) + return output + + +def weight_init_pnn(module: torch.nn.Module) -> None: + """ + Custom weight init for Conv2D and Linear layers + + delta-orthogonal init from https://arxiv.org/pdf/1806.05393.pdf + """ + if isinstance(module, torch.nn.Linear): + torch.manual_seed(int(time.time())) + torch.cuda.manual_seed_all(int(time.time())) + torch.nn.init.xavier_uniform_(module.weight) + module.bias.data.uniform_(-0.5, 0.5) + + elif isinstance(module, (torch.nn.Conv2d, torch.nn.ConvTranspose2d)): + assert module.weight.size(2) == module.weight.size(3) + module.weight.data.fill_(0.0) + module.bias.data.fill_(0.0) + mid = module.weight.size(2) // 2 + gain = torch.nn.init.calculate_gain("relu") + torch.nn.init.orthogonal_(module.weight.data[:, :, mid, mid], gain) + def get_device() -> torch.device: device = torch.device("cpu") diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index c01464c1..303d3ae0 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -12,247 +12,46 @@ # DQN Algorithms # ################################### - -def create_STEVE_MEAN(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. 
- - """ - from cares_reinforcement_learning.algorithm.mbrl import STEVE_MEAN - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import \ - EnsembleWorldEnsembleSASReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldEnsembleSASReward( - observation_size=observation_size, - num_actions=action_num, - num_world_models=config.num_world_models, - num_reward_models=config.num_reward_models, - lr=config.world_model_lr, - device=device, - ) - - agent = STEVE_MEAN( - actor_network=actor, - critic_network=critic, - world_network=world_model, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - L=config.num_critic_models, - device=device, - ) - return agent - - -def create_DynaSAC_SA(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SA - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensmeble_sa_world import EnsembleWorldAndOneSAReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneSAReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - lr=config.world_model_lr, - device=device, - ) - - agent = DynaSAC_SA( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - device=device, - ) - return agent - - - - -def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneSASReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - lr=config.world_model_lr, - device=device, - ) - - agent = DynaSAC_SAS( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - device=device, - ) - return agent - - -def create_DynaSAC_SAS_Immersive_Weight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. 
- An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS_Immersive_Weight - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneSASReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAC_SAS_Immersive_Weight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - reweight_critic=config.reweight_critic, - reweight_actor=config.reweight_actor, - mode=config.mode, - sample_times=config.sample_times, - ) - return agent - - - - -def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight +def create_SAC(observation_size, action_num, config: acf.SACConfig): + from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneNSReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - lr=config.world_model_lr, - ) + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) - agent = DynaSAC_BIVReweight( + device = hlp.get_device() + agent = SAC( actor_network=actor, critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, + config=config, device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - reweight_critic=config.reweight_critic, - reweight_actor=config.reweight_actor, - mode=config.mode, - sample_times=config.sample_times, ) return agent -def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
- """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward + from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = hlp.get_device() - world_model = EnsembleWorldAndOneNSReward( + world_model = Ensemble_Dyna_Big( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, device=device, - lr=config.world_model_lr, + sas=config.sas ) - agent = DynaSAC_SUNRISEReweight( + agent = DynaSAC_NS( actor_network=actor, critic_network=critic, world_network=world_model, @@ -261,87 +60,43 @@ def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: Algorit gamma=config.gamma, tau=config.tau, action_num=action_num, - device=device, alpha_lr=config.alpha_lr, horizon=config.horizon, num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - reweight_critic=config.reweight_critic, - reweight_actor=config.reweight_actor, - mode=config.mode, - sample_times=config.sample_times, - ) - return agent - - -def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight - from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward - - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - world_model = EnsembleWorldAndOneNSReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, device=device, - lr=config.world_model_lr, - ) - - agent = DynaSAC_UWACReweight( - actor_network=actor, - critic_network=critic, - world_network=world_model, - actor_lr=config.actor_lr, - critic_lr=config.critic_lr, - gamma=config.gamma, - tau=config.tau, - action_num=action_num, - device=device, - alpha_lr=config.alpha_lr, - horizon=config.horizon, - num_samples=config.num_samples, - threshold_scale=config.threshold_scale, - reweight_critic=config.reweight_critic, - reweight_actor=config.reweight_actor, - mode=config.mode, - sample_times=config.sample_times, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper ) return agent -def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): +def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_BoundedConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Bounded from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward + from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big - actor = Actor(observation_size, action_num) - critic = Critic(observation_size, action_num) + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = hlp.get_device() - world_model = EnsembleWorldAndOneNSReward( + world_model = Ensemble_Dyna_Big( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, - lr=config.world_model_lr, device=device, + l_r=config.world_model_lr, + sas=config.sas, + prob_rwd=True, + boost_inter=30 ) - agent = DynaSAC_NS( + agent = DynaSAC_Bounded( actor_network=actor, critic_network=critic, world_network=world_model, @@ -354,55 +109,283 @@ def create_DynaSAC(observation_size, action_num, config: AlgorithmConfig): horizon=config.horizon, num_samples=config.num_samples, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample ) - return agent - - -def create_SAC(observation_size, action_num, config: acf.SACConfig): - from cares_reinforcement_learning.algorithm.policy import SAC - from cares_reinforcement_learning.networks.SAC import Actor, Critic - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = SAC( - actor_network=actor, - critic_network=critic, - config=config, - device=device, - ) return agent +# def create_DynaSAC_SAS_Immersive_Weight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. +# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS_Immersive_Weight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneSASReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_SAS_Immersive_Weight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent + + +# def create_STEVE_MEAN(observation_size, action_num, config: acf.STEVE_MEANConfig): +# """ +# Create networks for model-based SAC agent. 
The Actor and Critic is same. +# An extra world model is added. +# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import STEVE_MEAN +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import \ +# EnsembleWorldEnsembleSASReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldEnsembleSASReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_world_models=config.num_world_models, +# num_reward_models=config.num_reward_models, +# lr=config.world_model_lr, +# device=device, +# ) +# +# agent = STEVE_MEAN( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# L=config.num_critic_models, +# device=device, +# ) +# return agent + + +# def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. +# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneSASReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# lr=config.world_model_lr, +# device=device, +# ) +# +# agent = DynaSAC_SAS( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# device=device, +# ) +# return agent + + + + + + + +# def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. 
+# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneNSReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_BIVReweight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent +# +# +# def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. +# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneNSReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_SUNRISEReweight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent +# +# +# def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. 
+# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneNSReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_UWACReweight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent -def create_DynaSAC(observation_size, action_num, config: acf.DynaSACConfig): - """ - Create networks for model-based SAC agent. The Actor and Critic is same. - An extra world model is added. - """ - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC - from cares_reinforcement_learning.networks.DynaSAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - device = hlp.get_device() - world_model = EnsembleWorldReward( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - lr=config.world_model_lr, - device=device, - ) - agent=DynaSAC( - actor_network=actor, - critic_network=critic, - world_network=world_model, - config=config, - device=device, - ) - return agent class NetworkFactory: From b7b1963f76e45827d616ba315d5433266742d1c6 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 22 Dec 2024 00:08:55 +1300 Subject: [PATCH 68/91] merge --- cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py | 2 +- cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index f84c99b1..9d83c2af 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -242,7 +242,7 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: dones=dones, weights=torch.ones(rewards.shape) ) - # self._dyna_generate_and_train(next_states) + self._dyna_generate_and_train(next_states) def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_states = [] diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py index 00a65d55..4624e392 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py @@ -211,7 +211,7 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: dones=dones, weights=torch.ones(rewards.shape) ) - # 
self._dyna_generate_and_train(next_states) + self._dyna_generate_and_train(next_states) def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_states = [] From 0842673b98dc876305af82a57c9259d736b181c2 Mon Sep 17 00:00:00 2001 From: tony Date: Sun, 22 Dec 2024 10:13:02 +1300 Subject: [PATCH 69/91] merge --- .../networks/world_models/world_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/networks/world_models/world_model.py b/cares_reinforcement_learning/networks/world_models/world_model.py index b6ec2744..b97a36ec 100644 --- a/cares_reinforcement_learning/networks/world_models/world_model.py +++ b/cares_reinforcement_learning/networks/world_models/world_model.py @@ -123,13 +123,13 @@ def train_reward( if self.sas: rwd_mean, rwd_var = self.reward_network(states, actions, next_states) else: - rwd_mean, rwd_var = self.reward_network(next_states, actions) + rwd_mean, rwd_var = self.reward_network(next_states) reward_loss = F.gaussian_nll_loss(input=rwd_mean, target=rewards, var=rwd_var) else: if self.sas: rwd_mean = self.reward_network(states, actions, next_states) else: - rwd_mean = self.reward_network(next_states, actions) + rwd_mean = self.reward_network(next_states) reward_loss = F.mse_loss(rwd_mean, rewards) reward_loss.backward() self.reward_optimizer.step() From 53b578de5af64a3f85a31815f1e87f82fdac3445 Mon Sep 17 00:00:00 2001 From: tony Date: Tue, 24 Dec 2024 20:53:52 +1300 Subject: [PATCH 70/91] merge --- .../algorithm/mbrl/DynaSAC_Bounded.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 9d83c2af..0af1e58f 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -11,7 +11,7 @@ import numpy as np import torch - +from torch import nn from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.networks.world_models.ensemble import ( @@ -84,10 +84,17 @@ def __init__( # World model self.world_model = world_network + self.k_l = nn.KLDivLoss(reduction='batchmean', log_target=True) + @property def _alpha(self) -> float: return self.log_alpha.exp() + def _jsd(self, p, q): + p, q = p.view(-1, p.size(-1)).log_softmax(-1), q.view(-1, q.size(-1)).log_softmax(-1) + m = (0.5 * (p + q)) + return 0.5 * (self.k_l(m, p) + self.k_l(m, q)) + def select_action_from_policy( self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: @@ -116,11 +123,15 @@ def select_action_from_policy( multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) world_dist = F.softmax(uncert, dim=0) + world_dist -= torch.min(world_dist) + final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist final_dist = F.softmax(final_dist, dim=0) candi = torch.argmax(final_dist) # new_dist = torch.distributions.Categorical(final_dist) # candi = new_dist.sample([5]).squeeze() + # print(self._jsd(policy_dist, final_dist)) + action = multi_action[candi] else: (action, _, _) = self.actor_net(state_tensor) From 671e18c5cbfb3cbf1e0d882cedc2edd91e92a471 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 27 Dec 2024 17:10:12 +1300 Subject: [PATCH 71/91] merge --- .../algorithm/mbrl/DynaSAC_Bounded.py | 6 +- .../mbrl/{STEVE_MEAN_SAC.py => STEVESAC.py} | 79 +++-- .../algorithm/mbrl/STEVESAC_Bounded.py | 313 
++++++++++++++++++ .../algorithm/mbrl/__init__.py | 3 +- .../ensemble/world_ensemble_big.py | 14 +- .../networks/world_models/world_model.py | 164 ++++++--- .../util/configurations.py | 71 +++- .../util/network_factory.py | 147 +++++--- 8 files changed, 657 insertions(+), 140 deletions(-) rename cares_reinforcement_learning/algorithm/mbrl/{STEVE_MEAN_SAC.py => STEVESAC.py} (81%) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 0af1e58f..3645d877 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -111,6 +111,7 @@ def select_action_from_policy( (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) # Estimate uncertainty # [6, 10, 17] + print("--------------------") _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( observation=multi_state_tensor, actions=multi_action) # [10, 17] @@ -120,13 +121,14 @@ def select_action_from_policy( epistemic = torch.clamp(epistemic, max=10e3) total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 uncert = torch.mean(total_unc, dim=1) + multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) world_dist = F.softmax(uncert, dim=0) world_dist -= torch.min(world_dist) - final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist - final_dist = F.softmax(final_dist, dim=0) + final_dist = policy_dist + self.threshold * world_dist + #final_dist = F.softmax(final_dist, dim=0) candi = torch.argmax(final_dist) # new_dist = torch.distributions.Categorical(final_dist) # candi = new_dist.sample([5]).squeeze() diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py similarity index 81% rename from cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py rename to cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py index 325d8a9c..b00dde06 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVE_MEAN_SAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py @@ -8,18 +8,18 @@ import copy import logging -import os + import numpy as np import torch -import torch.nn.functional as F - +from torch import nn from cares_reinforcement_learning.memory import MemoryBuffer + from cares_reinforcement_learning.networks.world_models.ensemble import ( - Ensemble_Dyna_Big + Ensemble_Dyna_Big, ) -class STEVE_MEAN: +class STEVESAC: def __init__( self, actor_network: torch.nn.Module, @@ -32,21 +32,31 @@ def __init__( critic_lr: float, alpha_lr: float, horizon: int, - L: int, device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, ): - self.L = L - self.horizon = horizon + logging.info("----------------------------------------") + logging.info("----I am runing the STEVESAC Agent! 
----") + logging.info("----------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.set_stat = False self.type = "mbrl" self.device = device + # this may be called policy_net in other implementations self.actor_net = actor_network.to(self.device) # this may be called soft_q_net in other implementations self.critic_net = critic_network.to(self.device) self.target_critic_net = copy.deepcopy(self.critic_net) + self.gamma = gamma self.tau = tau + self.horizon = horizon self.action_num = action_num self.learn_counter = 0 @@ -68,11 +78,12 @@ def __init__( # World model self.world_model = world_network + self.k_l = nn.KLDivLoss(reduction='batchmean', log_target=True) + @property def _alpha(self) -> float: return self.log_alpha.exp() - # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: @@ -95,23 +106,25 @@ def _train_policy( rewards: torch.Tensor, next_states: torch.Tensor, dones: torch.Tensor, + weights: torch.Tensor, ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) ################## Update the Critic First #################### with torch.no_grad(): not_dones = (1 - dones) q_means = [] q_weights = [] - accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=25, dim=0) + accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=30, dim=0) # 5 * 5 * 4 = 100 for hori in range(self.horizon): - _, curr_hori_log_pi, curr_hori_action= self.actor_net(next_states) + _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) mean_predictions, all_mean_next, _, _ = self.world_model.pred_next_states(next_states, curr_hori_action) - pred_rewards, _ = self.world_model.pred_multiple_rewards(observation=next_states, - action=curr_hori_action, - next_observation=all_mean_next) + pred_rewards, _ = self.world_model.pred_all_rewards(observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next) pred_rewards *= (self.gamma ** (hori + 1)) accum_dist_rewards += pred_rewards - # V = Q - alpha * logi pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) @@ -192,7 +205,6 @@ def train_world_model( states = torch.FloatTensor(np.asarray(states)).to(self.device) actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) self.world_model.train_world( @@ -200,12 +212,16 @@ def train_world_model( actions=actions, next_states=next_states, ) - self.world_model.train_reward( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states - ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: self.learn_counter += 1 @@ -227,22 +243,23 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: rewards=rewards, next_states=next_states, dones=dones, + 
weights=torch.ones(rewards.shape) ) def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) + self.set_stat = True def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") logging.info("models has been saved...") def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py new file mode 100644 index 00000000..4e8bd0b7 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -0,0 +1,313 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +import torch.nn.functional as F + + +class STEVESAC_Bounded: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int + ): + logging.info("------------------------------------------------") + logging.info("----I am runing the STEVESAC_Bounded Agent! 
----") + logging.info("------------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction='batchmean', log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + # if self.threshold == 0: + # (action, _, _) = self.actor_net(state_tensor) + # else: + # if self.set_stat: + # multi_state_tensor = torch.repeat_interleave(state_tensor, self.exploration_sample, dim=0) + # (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) + # # Estimate uncertainty + # # [6, 10, 17] + # _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( + # observation=multi_state_tensor, actions=multi_action) + # # [10, 17] + # aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5 + # epistemic = torch.var(nstate_means, dim=0) ** 0.5 + # aleatoric = torch.clamp(aleatoric, max=10e3) + # epistemic = torch.clamp(epistemic, max=10e3) + # total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + # uncert = torch.mean(total_unc, dim=1) + # multi_log_pi = multi_log_pi.squeeze() + # policy_dist = F.softmax(multi_log_pi, dim=0) + # world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) + # final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist + # final_dist = F.softmax(final_dist, dim=0) + # candi = torch.argmax(final_dist) + # # new_dist = torch.distributions.Categorical(final_dist) + # # candi = new_dist.sample([5]).squeeze() + # # print(self._jsd(policy_dist, final_dist)) + # action = multi_action[candi] + # else: + # (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = 
torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + not_dones = (1 - dones) + q_means = [] + q_weights = [] + accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=30, dim=0) + # 5 * 5 * 4 = 100 + for hori in range(self.horizon): + _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) + mean_predictions, all_mean_next, _, _ = self.world_model.pred_next_states(next_states, curr_hori_action) + pred_rewards, _ = self.world_model.pred_all_rewards(observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next) + pred_rewards *= (self.gamma ** (hori + 1)) + accum_dist_rewards += pred_rewards + # V = Q - alpha * logi + pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) + pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) + pred_v1 = pred_q1 - self._alpha * curr_hori_log_pi + pred_v2 = pred_q2 - self._alpha * curr_hori_log_pi + pred_v3 = pred_q3 - self._alpha * curr_hori_log_pi + pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi + q_0 = [] + for i in range(pred_rewards.shape[0]): + pred_tq1 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + pred_tq2 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + pred_tq3 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + pred_tq4 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + q_0.append(pred_tq1) + q_0.append(pred_tq2) + q_0.append(pred_tq3) + q_0.append(pred_tq4) + q_0 = torch.stack(q_0) + # Compute var, mean and add them to the queue + # [100, 256, 1] -> [256, 1] + mean_0 = torch.mean(q_0, dim=0) + q_means.append(mean_0) + var_0 = torch.var(q_0, dim=0) + var_0[torch.abs(var_0) < 0.0001] = 0.0001 + weights_0 = 1.0 / var_0 + q_weights.append(weights_0) + next_states = mean_predictions + all_means = torch.stack(q_means) + all_weights = torch.stack(q_weights) + total_weights = torch.sum(all_weights, dim=0) + for n in range(self.horizon): + all_weights[n] /= total_weights + q_target = torch.sum(all_weights * all_means, dim=0) + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model( + self, memory: MemoryBuffer, batch_size: int + ) -> None: + + experiences = memory.sample_uniform(batch_size) + states, 
actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape) + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 379eb6b9..0aa63719 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -4,4 +4,5 @@ from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight from .DynaSAC_UWAC_NS import DynaSAC_UWACReweight from .DynaSAC_BIV_NS import DynaSAC_BIVReweight -from .STEVE_MEAN_SAC import STEVE_MEAN +from .STEVESAC_Bounded import STEVESAC_Bounded +from .STEVESAC import STEVESAC diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py index 
dec35183..8e07e48e 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py @@ -32,8 +32,17 @@ def __init__(self, prob_rwd: bool = True, num_models: int = 7, boost_inter: int = 3, + num_rwd_model: int = 1 ): - super().__init__(observation_size, num_actions, l_r, device, hidden_size, sas, prob_rwd) + super().__init__(observation_size=observation_size, + num_actions=num_actions, + l_r=l_r, + device=device, + hidden_size=hidden_size, + sas=sas, + prob_rwd=prob_rwd, + num_rwd_model=num_rwd_model) + self.num_models = num_models self.observation_size = observation_size self.num_actions = num_actions @@ -133,6 +142,8 @@ def train_world( self.update_counter += 1 self.update_counter %= self.boost_inter * self.num_models + + def estimate_uncertainty( self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool ) -> tuple[float, float, torch.Tensor]: @@ -242,7 +253,6 @@ def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: t samples = denormalize_observation_delta(samples, self.statistics) samples += states - if self.prob_rwd: if self.sas: rwd_mean, rwd_var = self.reward_network(states, actions, samples) diff --git a/cares_reinforcement_learning/networks/world_models/world_model.py b/cares_reinforcement_learning/networks/world_models/world_model.py index b97a36ec..865379a8 100644 --- a/cares_reinforcement_learning/networks/world_models/world_model.py +++ b/cares_reinforcement_learning/networks/world_models/world_model.py @@ -22,7 +22,9 @@ def __init__( hidden_size=None, sas: bool = True, prob_rwd: bool = False, + num_rwd_model: int = 5 ): + logging.info(f"Num of Reward models: {num_rwd_model}") if hidden_size is None: hidden_size = [128, 128] self.sas = None @@ -32,38 +34,46 @@ def __init__( self.sas = sas self.prob_rwd = prob_rwd self.statistics = {} - if prob_rwd: - if sas: - self.reward_network = Probabilistic_SAS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - normalize=False - ) - else: - self.reward_network = Probabilistic_NS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - normalize=False - ) - else: - if sas: - self.reward_network = Simple_SAS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - normalize=False - ) + self.counter = 0 + self.num_rwd_model = num_rwd_model + + self.rwd_models = [] + self.rwd_model_optimizers = [] + for i in range(self.num_rwd_model): + if prob_rwd: + if sas: + reward_network = Probabilistic_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) + else: + reward_network = Probabilistic_NS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) else: - self.reward_network = Simple_NS_Reward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - normalize=False - ) - self.reward_network.to(self.device) - self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=l_r) + if sas: + reward_network = Simple_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False + ) + else: + reward_network = Simple_NS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + 
normalize=False + ) + reward_network.to(self.device) + self.rwd_models.append(reward_network) + reward_optimizer = optim.Adam(reward_network.parameters(), lr=l_r) + self.rwd_model_optimizers.append(reward_optimizer) def set_statistics(self, statistics: dict) -> None: """ @@ -118,21 +128,22 @@ def train_reward( :param next_states: :param rewards: """ - self.reward_optimizer.zero_grad() + indice = self.counter % self.num_rwd_model + self.rwd_model_optimizers[indice].zero_grad() if self.prob_rwd: if self.sas: - rwd_mean, rwd_var = self.reward_network(states, actions, next_states) + rwd_mean, rwd_var = self.rwd_models[indice](states, actions, next_states) else: - rwd_mean, rwd_var = self.reward_network(next_states) + rwd_mean, rwd_var = self.rwd_models[indice](next_states) reward_loss = F.gaussian_nll_loss(input=rwd_mean, target=rewards, var=rwd_var) else: if self.sas: - rwd_mean = self.reward_network(states, actions, next_states) + rwd_mean = self.rwd_models[indice](states, actions, next_states) else: - rwd_mean = self.reward_network(next_states) + rwd_mean = self.rwd_models[indice](next_states) reward_loss = F.mse_loss(rwd_mean, rewards) reward_loss.backward() - self.reward_optimizer.step() + self.rwd_model_optimizers[indice].step() def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor]: @@ -143,22 +154,77 @@ def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_obs :param next_observation: :return: Predicted rewards, Means of rewards, Variances of rewards """ - - if self.prob_rwd: - if self.sas: - pred_rewards, rwd_var = self.reward_network(observation, action, next_observation) + preds = [] + preds_vars = [] + for i in range(self.num_rwd_model): + if self.prob_rwd: + if self.sas: + pred_rewards, rwd_var = self.rwd_models[i](observation, action, next_observation) + else: + pred_rewards, rwd_var = self.rwd_models[i](next_observation) else: - pred_rewards, rwd_var = self.reward_network(next_observation) - return pred_rewards, rwd_var - else: - if self.sas: - pred_rewards = self.reward_network(observation, action, next_observation) + if self.sas: + pred_rewards = self.rwd_models[i](observation, action, next_observation) + else: + pred_rewards = self.rwd_models[i](next_observation) + rwd_var = None + preds.append(pred_rewards) + preds_vars.append(rwd_var) + preds = torch.stack(preds) + total_unc = 0.0 + if self.num_rwd_model > 1: + epistemic_uncert = torch.var(preds, dim=0) ** 0.5 + aleatoric_uncert = torch.zeros(epistemic_uncert.shape) + if rwd_var is None: + rwd_var = torch.zeros(preds.shape) else: - pred_rewards = self.reward_network(next_observation) - return pred_rewards, None + rwd_var = torch.stack(preds_vars) + aleatoric_uncert = torch.mean(rwd_var ** 2, dim=0) ** 0.5 + total_unc = (aleatoric_uncert ** 2 + epistemic_uncert ** 2) ** 0.5 + + if preds.shape[0] > 1: + preds = torch.mean(preds, dim=0) + else: + preds = preds[0] + + return preds, total_unc + + def pred_all_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict reward based on SAS + :param observation: + :param action: + :param next_observation: + :return: Predicted rewards, Means of rewards, Variances of rewards + """ + preds = [] + preds_vars = [] + for j in range(next_observation.shape[0]): + for i in range(self.num_rwd_model): + if self.prob_rwd: + if self.sas: + pred_rewards, rwd_var = self.rwd_models[i](observation, 
action, next_observation[j]) + else: + pred_rewards, rwd_var = self.rwd_models[i](next_observation[j]) + else: + if self.sas: + pred_rewards = self.rwd_models[i](observation, action, next_observation[j]) + else: + pred_rewards = self.rwd_models[i](next_observation[j]) + rwd_var = None + preds.append(pred_rewards) + preds_vars.append(rwd_var) + preds = torch.stack(preds) + if rwd_var is None: + preds_vars = torch.zeros(preds.shape) + else: + preds_vars = torch.stack(preds_vars) + + return preds, preds_vars def estimate_uncertainty( - self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool, + self, observation: torch.Tensor, actions: torch.Tensor, train_reward: bool, ) -> tuple[float, float, torch.Tensor]: """ Estimate next state uncertainty and reward uncertainty. diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 8048256e..5db5515f 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -134,6 +134,7 @@ class PPOConfig(AlgorithmConfig): ) critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) + ################################### # SAC Algorithms # ################################### @@ -155,7 +156,7 @@ class SACConfig(AlgorithmConfig): class DynaSAC_NSConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_NS", Literal=True) type: str = Field("mbrl", Literal=True) - G: int = 1, + G: int = 1, G_model: float = 1, actor_lr: float = 3e-4 @@ -181,6 +182,73 @@ class DynaSAC_NSConfig(AlgorithmConfig): gripper: bool = False +class STEVESACConfig(AlgorithmConfig): + algorithm: str = Field("STEVESAC", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = 1, + G_model: float = 1, + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + + num_models: int = 6 + num_rwd_models: int = 5 + world_model_lr: float = 0.001 + + horizon: int = 3 + + sas: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + +class STEVESAC_BoundedConfig(AlgorithmConfig): + algorithm: str = Field("STEVESAC_Bounded", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = 1, + G_model: float = 1, + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + + num_models: int = 6 + num_rwd_models: int = 5 + world_model_lr: float = 0.001 + + horizon: int = 3 + + sas: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + threshold: float = 0.1 + exploration_sample: int = 5 + + class DynaSAC_BoundedConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_Bounded", Literal=True) type: str = Field("mbrl", Literal=True) @@ -662,6 +730,7 @@ class DroQConfig(SACConfig): layer_order=["dropout", "layernorm", "activation"], ) + class DDPGConfig(AlgorithmConfig): 
algorithm: str = Field("DDPG", Literal=True) actor_lr: float = 1e-4 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 303d3ae0..8e1acaf1 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -5,6 +5,7 @@ import cares_reinforcement_learning.util.configurations as acf import cares_reinforcement_learning.util.helpers as hlp + # Disable these as this is a deliberate use of dynamic imports # pylint: disable=import-outside-toplevel # pylint: disable=invalid-name @@ -118,6 +119,94 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou return agent + +def create_STEVESAC(observation_size, action_num, config: acf.STEVESACConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + """ + from cares_reinforcement_learning.algorithm.mbrl import STEVESAC + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) + + device = hlp.get_device() + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, + device=device, + l_r=config.world_model_lr, + sas=config.sas + ) + + agent = STEVESAC( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper + ) + return agent + + +def create_STEVESAC_Bounded(observation_size, action_num, config: acf.STEVESAC_BoundedConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + """ + + from cares_reinforcement_learning.algorithm.mbrl import STEVESAC_Bounded + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) + + device = hlp.get_device() + + world_model = Ensemble_Dyna_Big(observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, + device=device, + l_r=config.world_model_lr, sas=config.sas) + + agent = STEVESAC_Bounded( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample + ) + + return agent + + # def create_DynaSAC_SAS_Immersive_Weight(observation_size, action_num, config: AlgorithmConfig): # """ # Create networks for model-based SAC agent. The Actor and Critic is same. 
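For reference, a minimal standalone sketch of the uncertainty combination used by the multi-head reward ensemble introduced above in world_model.py: epistemic uncertainty is the spread of the reward means across heads, aleatoric uncertainty is taken from the per-head predicted variances, and the two are combined in quadrature, mirroring pred_rewards. The head count and batch shape below are illustrative assumptions, not values taken from the patch.

import torch

num_heads, batch = 5, 256                              # assumed sizes for illustration
preds = torch.randn(num_heads, batch, 1)               # per-head reward means
pred_vars = torch.rand(num_heads, batch, 1)            # per-head predicted variances

epistemic = torch.var(preds, dim=0) ** 0.5             # disagreement between heads
aleatoric = torch.mean(pred_vars ** 2, dim=0) ** 0.5   # average noise the heads report
total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5   # combined in quadrature, as in pred_rewards
mean_reward = torch.mean(preds, dim=0)                 # ensemble estimate used downstream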
@@ -163,48 +252,6 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou # return agent -# def create_STEVE_MEAN(observation_size, action_num, config: acf.STEVE_MEANConfig): -# """ -# Create networks for model-based SAC agent. The Actor and Critic is same. -# An extra world model is added. -# -# """ -# from cares_reinforcement_learning.algorithm.mbrl import STEVE_MEAN -# from cares_reinforcement_learning.networks.SAC import Actor, Critic -# from cares_reinforcement_learning.networks.world_models.ensemble_world_ensemble_sas_reward import \ -# EnsembleWorldEnsembleSASReward -# -# actor = Actor(observation_size, action_num) -# critic = Critic(observation_size, action_num) -# -# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# -# world_model = EnsembleWorldEnsembleSASReward( -# observation_size=observation_size, -# num_actions=action_num, -# num_world_models=config.num_world_models, -# num_reward_models=config.num_reward_models, -# lr=config.world_model_lr, -# device=device, -# ) -# -# agent = STEVE_MEAN( -# actor_network=actor, -# critic_network=critic, -# world_network=world_model, -# gamma=config.gamma, -# tau=config.tau, -# action_num=action_num, -# actor_lr=config.actor_lr, -# critic_lr=config.critic_lr, -# alpha_lr=config.alpha_lr, -# horizon=config.horizon, -# L=config.num_critic_models, -# device=device, -# ) -# return agent - - # def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): # """ # Create networks for model-based SAC agent. The Actor and Critic is same. @@ -245,11 +292,6 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou # return agent - - - - - # def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): # """ # Create networks for model-based SAC agent. The Actor and Critic is same. 
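For orientation, a minimal usage sketch of the new STEVESAC factory path; it is not part of the patch and assumes that NetworkFactory.create_network dispatches on config.algorithm to create_STEVESAC in the same way it does for the existing algorithms. The observation and action sizes are placeholders.

import cares_reinforcement_learning.util.configurations as acf
from cares_reinforcement_learning.util.network_factory import NetworkFactory

config = acf.STEVESACConfig(num_models=6, num_rwd_models=5, horizon=3)
factory = NetworkFactory()
# Placeholder sizes for illustration; real values come from the environment.
agent = factory.create_network(observation_size=17, action_num=6, config=config)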
@@ -385,15 +427,12 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou # return agent - - - class NetworkFactory: def create_network( - self, - observation_size, - action_num: int, - config: acf.AlgorithmConfig, + self, + observation_size, + action_num: int, + config: acf.AlgorithmConfig, ): algorithm = config.algorithm From 503addb3f12c772c2957d3f3ce27092538f9ea50 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 27 Dec 2024 17:37:41 +1300 Subject: [PATCH 72/91] merge --- cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 3645d877..9986e463 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -111,7 +111,6 @@ def select_action_from_policy( (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) # Estimate uncertainty # [6, 10, 17] - print("--------------------") _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( observation=multi_state_tensor, actions=multi_action) # [10, 17] From 1f398ee334dd2e2108e7728856bd0853f586f5bc Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 27 Dec 2024 23:06:06 +1300 Subject: [PATCH 73/91] merge --- .../algorithm/mbrl/DynaSAC_Bounded.py | 12 ++-- .../algorithm/mbrl/STEVESAC_Bounded.py | 65 ++++++++++--------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 9986e463..2576c8be 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -120,19 +120,21 @@ def select_action_from_policy( epistemic = torch.clamp(epistemic, max=10e3) total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 uncert = torch.mean(total_unc, dim=1) + world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) - multi_log_pi = multi_log_pi.squeeze() + Q_s = self.critic_net(multi_state_tensor, multi_action) + Q_s = Q_s.squeeze() + multi_log_pi = Q_s + #multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - world_dist = F.softmax(uncert, dim=0) - world_dist -= torch.min(world_dist) final_dist = policy_dist + self.threshold * world_dist - #final_dist = F.softmax(final_dist, dim=0) + # final_dist = F.softmax(final_dist, dim=0) candi = torch.argmax(final_dist) # new_dist = torch.distributions.Categorical(final_dist) # candi = new_dist.sample([5]).squeeze() # print(self._jsd(policy_dist, final_dist)) - action = multi_action[candi] else: (action, _, _) = self.actor_net(state_tensor) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py index 4e8bd0b7..d1a7b353 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -98,36 +98,41 @@ def select_action_from_policy( state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) if evaluation is False: (action, _, _) = self.actor_net(state_tensor) - # if self.threshold == 0: - # (action, _, _) = self.actor_net(state_tensor) - # else: - # if self.set_stat: - # multi_state_tensor = torch.repeat_interleave(state_tensor, self.exploration_sample, dim=0) 
- # (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) - # # Estimate uncertainty - # # [6, 10, 17] - # _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( - # observation=multi_state_tensor, actions=multi_action) - # # [10, 17] - # aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5 - # epistemic = torch.var(nstate_means, dim=0) ** 0.5 - # aleatoric = torch.clamp(aleatoric, max=10e3) - # epistemic = torch.clamp(epistemic, max=10e3) - # total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 - # uncert = torch.mean(total_unc, dim=1) - # multi_log_pi = multi_log_pi.squeeze() - # policy_dist = F.softmax(multi_log_pi, dim=0) - # world_dist = F.softmax(uncert, dim=0) - # world_dist -= torch.min(world_dist) - # final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist - # final_dist = F.softmax(final_dist, dim=0) - # candi = torch.argmax(final_dist) - # # new_dist = torch.distributions.Categorical(final_dist) - # # candi = new_dist.sample([5]).squeeze() - # # print(self._jsd(policy_dist, final_dist)) - # action = multi_action[candi] - # else: - # (action, _, _) = self.actor_net(state_tensor) + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave(state_tensor, self.exploration_sample, dim=0) + (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action) + # [10, 17] + aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + uncert = torch.mean(total_unc, dim=1) + world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) + + Q_s = self.critic_net(multi_state_tensor, multi_action) + Q_s = Q_s.squeeze() + multi_log_pi = Q_s + + # multi_log_pi = multi_log_pi.squeeze() + policy_dist = F.softmax(multi_log_pi, dim=0) + final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist + candi = torch.argmax(final_dist) + # final_dist = F.softmax(final_dist, dim=0) + # new_dist = torch.distributions.Categorical(final_dist) + # candi = new_dist.sample([5]).squeeze() + # print(self._jsd(policy_dist, final_dist)) + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) else: (_, _, action) = self.actor_net(state_tensor) action = action.cpu().data.numpy().flatten() From f22a0698aa165e8fe325851def184664c11f0502 Mon Sep 17 00:00:00 2001 From: tony Date: Fri, 27 Dec 2024 23:18:24 +1300 Subject: [PATCH 74/91] merge --- cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py | 3 ++- .../algorithm/mbrl/STEVESAC_Bounded.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 2576c8be..b68da7ca 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -123,7 +123,8 @@ def select_action_from_policy( world_dist = F.softmax(uncert, dim=0) # world_dist -= torch.min(world_dist) - Q_s = self.critic_net(multi_state_tensor, multi_action) + Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + 
Q_s = torch.minimum(Q_1, Q_2) Q_s = Q_s.squeeze() multi_log_pi = Q_s #multi_log_pi = multi_log_pi.squeeze() diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py index d1a7b353..8ed33fa2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -118,7 +118,8 @@ def select_action_from_policy( world_dist = F.softmax(uncert, dim=0) # world_dist -= torch.min(world_dist) - Q_s = self.critic_net(multi_state_tensor, multi_action) + Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + Q_s = torch.minimum(Q_1, Q_2) Q_s = Q_s.squeeze() multi_log_pi = Q_s From bd87ed5991fa05641416fadfbd62c0f0dbe9f292 Mon Sep 17 00:00:00 2001 From: "Formatter [BOT]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 28 Dec 2024 02:05:59 +0000 Subject: [PATCH 75/91] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../algorithm/mbrl/DynaSAC_BIV_NS.py | 145 ++++++++++------- .../algorithm/mbrl/DynaSAC_Bounded.py | 63 +++++--- .../algorithm/mbrl/DynaSAC_IW_NS.py | 128 +++++++++------ .../algorithm/mbrl/DynaSAC_NS.py | 29 ++-- .../algorithm/mbrl/DynaSAC_SUNRISE_NS.py | 129 +++++++++------ .../algorithm/mbrl/DynaSAC_UWAC_NS.py | 131 +++++++++------ .../algorithm/mbrl/STEVESAC.py | 98 +++++++----- .../algorithm/mbrl/STEVESAC_Bounded.py | 125 +++++++++------ .../networks/SAC/triple_critic.py | 1 - .../world_models/ensemble/__init__.py | 2 +- .../ensemble/world_ensemble_big.py | 151 +++++++++++------- .../ensemble/world_ensemble_one_rwd.py | 133 +++++++++------ .../simple/probabilistic_dynamic.py | 20 +-- .../simple/probabilistic_ns_reward.py | 19 ++- .../simple/probabilistic_sas_reward.py | 24 ++- .../world_models/simple/simple_ns_reward.py | 17 +- .../world_models/simple/simple_sas_reward.py | 22 ++- .../networks/world_models/world_model.py | 118 +++++++++----- cares_reinforcement_learning/util/__init__.py | 2 +- .../util/configurations.py | 16 +- cares_reinforcement_learning/util/helpers.py | 1 + .../util/network_factory.py | 62 ++++--- .../util/uncertainty_estimation.py | 17 +- 23 files changed, 900 insertions(+), 553 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py index 90c46dd3..a11c326e 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py @@ -17,7 +17,7 @@ from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.networks.world_models.ensemble import ( - Ensemble_Dyna_Big + Ensemble_Dyna_Big, ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -29,24 +29,24 @@ class DynaSAC_BIVReweight: """ def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + 
action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, ): self.type = "mbrl" self.device = device @@ -94,7 +94,7 @@ def _alpha(self) -> float: # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -109,13 +109,13 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target values? @@ -125,7 +125,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -182,7 +182,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -191,15 +191,13 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -256,15 +254,21 @@ def _dyna_generate_and_train(self, next_states): with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var ) - uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -297,29 +301,44 @@ def sampling(self, curr_states, pred_means, pred_vars): with torch.no_grad(): # 5 models. 
Each predict 10 next_states. sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) + [self.sample_times] + ) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) + [self.sample_times] + ) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) + [self.sample_times] + ) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) + [self.sample_times] + ) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) + [self.sample_times] + ) rs = [] acts = [] qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): - sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) sample1i += curr_states - sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) sample2i += curr_states - sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) sample3i += curr_states - sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) sample4i += curr_states - sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) sample5i += curr_states # 5 models, each sampled 10 times = 50, @@ -393,10 +412,18 @@ def sampling(self, curr_states, pred_means, pred_vars): gamma_sq = self.gamma * self.gamma # Ablation if self.mode == 0: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) if self.mode == 1: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + total_var = ( + var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + ) if self.mode == 2: total_var = var_r + gamma_sq * var_a + gamma_sq * var_q if self.mode == 3: @@ -410,24 +437,28 @@ def sampling(self, curr_states, pred_means, pred_vars): total_stds = ratio * weights return total_stds.detach() - def get_optimal_xi(self, variances): minimal_size = self.threshold_scale if self.compute_eff_bs(self.get_iv_weights(variances)) >= minimal_size: return 0 - fn = lambda x: np.abs(self.compute_eff_bs(self.get_iv_weights(variances + np.abs(x))) - minimal_size) - epsilon = minimize(fn, 0, method='Nelder-Mead', options={'fatol': 1.0, 'maxiter': 100}) + fn = lambda x: np.abs( + self.compute_eff_bs(self.get_iv_weights(variances + np.abs(x))) + - minimal_size + ) + epsilon = minimize( + fn, 0, method="Nelder-Mead", options={"fatol": 1.0, "maxiter": 100} + ) xi = np.abs(epsilon.x[0]) xi = 0 if xi is None else xi return xi def get_iv_weights(self, variances): - ''' + """ Returns Inverse Variance weights Params ====== variances (numpy array): variance of the targets - ''' + """ weights = 1 / variances weights = weights / np.sum(weights) return weights @@ -438,8 +469,6 @@ def compute_eff_bs(self, weights): eff_bs = eff_bs / 
np.shape(weights)[0] return eff_bs - - def set_statistics(self, stats: dict) -> None: self.world_model.set_statistics(stats) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index b68da7ca..d1fcf366 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -19,6 +19,7 @@ ) import torch.nn.functional as F + class DynaSAC_Bounded: def __init__( self, @@ -36,9 +37,9 @@ def __init__( device: torch.device, train_reward: bool, train_both: bool, - gripper:bool, - threshold:float, - exploration_sample:int + gripper: bool, + threshold: float, + exploration_sample: int, ): logging.info("-----------------------------------------------") logging.info("----I am runing the DynaSAC_Bounded Agent! ----") @@ -84,15 +85,17 @@ def __init__( # World model self.world_model = world_network - self.k_l = nn.KLDivLoss(reduction='batchmean', log_target=True) + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) @property def _alpha(self) -> float: return self.log_alpha.exp() def _jsd(self, p, q): - p, q = p.view(-1, p.size(-1)).log_softmax(-1), q.view(-1, q.size(-1)).log_softmax(-1) - m = (0.5 * (p + q)) + p, q = p.view(-1, p.size(-1)).log_softmax(-1), q.view( + -1, q.size(-1) + ).log_softmax(-1) + m = 0.5 * (p + q) return 0.5 * (self.k_l(m, p) + self.k_l(m, q)) def select_action_from_policy( @@ -107,18 +110,25 @@ def select_action_from_policy( (action, _, _) = self.actor_net(state_tensor) else: if self.set_stat: - multi_state_tensor = torch.repeat_interleave(state_tensor, self.exploration_sample, dim=0) - (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) # Estimate uncertainty # [6, 10, 17] - _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( - observation=multi_state_tensor, actions=multi_action) + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) # [10, 17] - aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5 + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 epistemic = torch.var(nstate_means, dim=0) ** 0.5 aleatoric = torch.clamp(aleatoric, max=10e3) epistemic = torch.clamp(epistemic, max=10e3) - total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 uncert = torch.mean(total_unc, dim=1) world_dist = F.softmax(uncert, dim=0) # world_dist -= torch.min(world_dist) @@ -127,7 +137,7 @@ def select_action_from_policy( Q_s = torch.minimum(Q_1, Q_2) Q_s = Q_s.squeeze() multi_log_pi = Q_s - #multi_log_pi = multi_log_pi.squeeze() + # multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) final_dist = policy_dist + self.threshold * world_dist @@ -164,7 +174,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -208,9 +218,7 @@ def _train_policy( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: 
MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -255,7 +263,7 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: rewards=rewards, next_states=next_states, dones=dones, - weights=torch.ones(rewards.shape) + weights=torch.ones(rewards.shape), ) self._dyna_generate_and_train(next_states) @@ -268,7 +276,9 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) # pred_acts = torch.FloatTensor(rand_acts).to(self.device) @@ -281,9 +291,11 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_reward = self.reward_function(pred_state, pred_next_state) pred_next_state[:, -2:] = pred_state[:, -2:] else: - pred_reward, _ = self.world_model.pred_rewards(observation=pred_state, - action=pred_acts, - next_observation=pred_next_state) + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) @@ -298,7 +310,12 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, torch.ones(pred_rs.shape) + pred_states, + pred_actions, + pred_rs, + pred_n_states, + pred_dones, + torch.ones(pred_rs.shape), ) def reward_function(self, curr_states, next_states): diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py index 915bed13..7684f817 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py @@ -16,37 +16,36 @@ from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.networks.world_models.ensemble import ( - Ensemble_Dyna_Big + Ensemble_Dyna_Big, ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta - class DynaSAC_ScaleBatchReweight: """ Max as ? 
""" def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, ): self.type = "mbrl" self.device = device @@ -94,7 +93,7 @@ def _alpha(self) -> float: # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -109,13 +108,13 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target values? @@ -125,7 +124,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -182,7 +181,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -191,15 +190,13 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -256,15 +253,21 @@ def _dyna_generate_and_train(self, next_states): with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. 
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var ) - uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -297,30 +300,45 @@ def sampling(self, curr_states, pred_means, pred_vars): with torch.no_grad(): # 5 models. Each predict 10 next_states. sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) + [self.sample_times] + ) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) + [self.sample_times] + ) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) + [self.sample_times] + ) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) + [self.sample_times] + ) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) + [self.sample_times] + ) rs = [] acts = [] qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): - sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) sample1i += curr_states - sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) sample2i += curr_states - sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) sample3i += curr_states - sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) sample4i += curr_states - sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) sample5i += curr_states if self.reweight_critic == 1: @@ -388,8 +406,14 @@ def sampling(self, curr_states, pred_means, pred_vars): cov_ra = torch.mean(diff_r * diff_a, dim=0) gamma_sq = self.gamma * self.gamma - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) if self.reweight_actor: mean_a = torch.mean(acts, dim=0, keepdim=True) @@ -401,7 +425,7 @@ def sampling(self, curr_states, pred_means, pred_vars): var_a = torch.var(acts, dim=0) var_q = torch.var(qs, dim=0) # For actor: alpha^2 * var_a + var_q - total_var = (self._alpha ** 2) * var_a + var_q + (self._alpha ** 2) * cov_aq + total_var = (self._alpha**2) * var_a + var_q + (self._alpha**2) * cov_aq min_var = torch.min(total_var) max_var = torch.max(total_var) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py 
b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py index 4624e392..f3bd4ee1 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py @@ -36,7 +36,7 @@ def __init__( device: torch.device, train_reward: bool, train_both: bool, - gripper:bool, + gripper: bool, ): logging.info("-------------------------------------------") logging.info("----I am runing the Dyna_SAC_NS Agent! ----") @@ -118,7 +118,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -162,9 +162,7 @@ def _train_policy( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -209,7 +207,7 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: rewards=rewards, next_states=next_states, dones=dones, - weights=torch.ones(rewards.shape) + weights=torch.ones(rewards.shape), ) self._dyna_generate_and_train(next_states) @@ -222,7 +220,9 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. 
# rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) # pred_acts = torch.FloatTensor(rand_acts).to(self.device) @@ -235,9 +235,11 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_reward = self.reward_function(pred_state, pred_next_state) pred_next_state[:, -2:] = pred_state[:, -2:] else: - pred_reward, _ = self.world_model.pred_rewards(observation=pred_state, - action=pred_acts, - next_observation=pred_next_state) + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) pred_states.append(pred_state) pred_actions.append(pred_acts.detach()) @@ -252,7 +254,12 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) # states, actions, rewards, next_states, not_dones self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, torch.ones(pred_rs.shape) + pred_states, + pred_actions, + pred_rs, + pred_n_states, + pred_dones, + torch.ones(pred_rs.shape), ) def reward_function(self, curr_states, next_states): diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py index eb8036d1..1133eddb 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.networks.world_models.ensemble import ( - Ensemble_Dyna_Big + Ensemble_Dyna_Big, ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -27,24 +27,24 @@ class DynaSAC_SUNRISEReweight: """ def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, ): self.type = "mbrl" self.device = device @@ -92,7 +92,7 @@ def _alpha(self) -> float: # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -107,13 +107,13 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target 
values? @@ -123,7 +123,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -180,7 +180,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -189,15 +189,13 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -254,15 +252,21 @@ def _dyna_generate_and_train(self, next_states): with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var ) - uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -295,29 +299,44 @@ def sampling(self, curr_states, pred_means, pred_vars): with torch.no_grad(): # 5 models. Each predict 10 next_states. sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) + [self.sample_times] + ) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) + [self.sample_times] + ) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) + [self.sample_times] + ) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) + [self.sample_times] + ) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) + [self.sample_times] + ) rs = [] acts = [] qs = [] # Varying the next_state's distribution. 
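For reference (not part of the patch): the sampling routine being reformatted in the hunks above draws sample_times samples from each ensemble member's predictive Normal and pools them into one empirical distribution over next-state deltas. A minimal sketch of that pooling step, with placeholder names; note that torch.distributions.Normal takes a standard deviation as its scale argument, so it is worth double-checking separately whether pred_vars holds variances or standard deviations at this point.

import torch

def sample_ensemble(pred_means: torch.Tensor, pred_stds: torch.Tensor, n: int) -> torch.Tensor:
    # pred_means, pred_stds: [num_models, batch, obs_dim]
    dist = torch.distributions.Normal(pred_means, pred_stds)
    samples = dist.sample((n,))      # [n, num_models, batch, obs_dim]
    return samples.flatten(0, 1)     # pooled: [n * num_models, batch, obs_dim]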
for i in range(self.sample_times): - sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) sample1i += curr_states - sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) sample2i += curr_states - sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) sample3i += curr_states - sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) sample4i += curr_states - sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) sample5i += curr_states # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1i) @@ -390,10 +409,18 @@ def sampling(self, curr_states, pred_means, pred_vars): gamma_sq = self.gamma * self.gamma # Ablation if self.mode == 0: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) - total_stds = torch.sigmoid(-1 * torch.sqrt(total_var) * self.threshold_scale) + 0.5 + total_stds = ( + torch.sigmoid(-1 * torch.sqrt(total_var) * self.threshold_scale) + 0.5 + ) return total_stds.detach() diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py index 57c58e7b..3f526b25 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py @@ -16,7 +16,7 @@ from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.networks.world_models.ensemble import ( - Ensemble_Dyna_Big + Ensemble_Dyna_Big, ) from cares_reinforcement_learning.util.helpers import denormalize_observation_delta @@ -28,24 +28,24 @@ class DynaSAC_UWACReweight: """ def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, ): self.type = "mbrl" self.device = device @@ -93,7 +93,7 @@ def _alpha(self) -> float: # pylint: disable-next=unused-argument to keep the same interface def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when 
evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -108,13 +108,13 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: ################## Update the Critic First #################### # Have more target values? @@ -124,7 +124,7 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values @@ -181,7 +181,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -190,15 +190,13 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -255,15 +253,21 @@ def _dyna_generate_and_train(self, next_states): with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, pred_mean, pred_var = self.world_model.pred_next_states( - pred_state, pred_acts + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var ) - uncert = self.sampling(curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var) uncert = uncert.unsqueeze(dim=1).to(self.device) pred_uncerts.append(uncert) @@ -296,29 +300,44 @@ def sampling(self, curr_states, pred_means, pred_vars): with torch.no_grad(): # 5 models. Each predict 10 next_states. 
sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times]) + [self.sample_times] + ) sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times]) + [self.sample_times] + ) sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times]) + [self.sample_times] + ) sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times]) + [self.sample_times] + ) sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times]) + [self.sample_times] + ) rs = [] acts = [] qs = [] # Varying the next_state's distribution. for i in range(self.sample_times): - sample1i = denormalize_observation_delta(sample1[i], self.world_model.statistics) + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) sample1i += curr_states - sample2i = denormalize_observation_delta(sample2[i], self.world_model.statistics) + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) sample2i += curr_states - sample3i = denormalize_observation_delta(sample3[i], self.world_model.statistics) + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) sample3i += curr_states - sample4i = denormalize_observation_delta(sample4[i], self.world_model.statistics) + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) sample4i += curr_states - sample5i = denormalize_observation_delta(sample5[i], self.world_model.statistics) + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) sample5i += curr_states # 5 models, each sampled 10 times = 50, pred_rwd1 = self.world_model.pred_rewards(sample1i) @@ -391,13 +410,21 @@ def sampling(self, curr_states, pred_means, pred_vars): gamma_sq = self.gamma * self.gamma # Ablation if self.mode == 0: - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) if self.mode == 1: total_var = gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq - total_stds = torch.minimum(self.threshold_scale / total_var, - torch.ones(total_var.shape).to(self.device) * 1.5) + total_stds = torch.minimum( + self.threshold_scale / total_var, + torch.ones(total_var.shape).to(self.device) * 1.5, + ) return total_stds.detach() diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py index b00dde06..bcfb356a 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py @@ -21,21 +21,21 @@ class STEVESAC: def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - horizon: int, - device: torch.device, - train_reward: bool, - train_both: bool, - gripper: bool, + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, ): 
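For reference (not part of the patch): the two per-sample weighting rules reformatted above reduce to a sigmoid of the target's standard deviation (the SUNRISE-style rule, mode 0) and a clipped inverse variance (the UWAC-style rule). A compact restatement under those assumptions:

import torch

def sunrise_weight(total_var: torch.Tensor, threshold_scale: float) -> torch.Tensor:
    # High target variance -> weight near 0.5; low variance -> weight near 1.0.
    return torch.sigmoid(-torch.sqrt(total_var) * threshold_scale) + 0.5

def uwac_weight(total_var: torch.Tensor, threshold_scale: float) -> torch.Tensor:
    # Inverse-variance weight, clipped at 1.5 so very confident samples do not dominate.
    return torch.minimum(threshold_scale / total_var, torch.full_like(total_var, 1.5))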
logging.info("----------------------------------------") logging.info("----I am runing the STEVESAC Agent! ----") @@ -78,14 +78,14 @@ def __init__( # World model self.world_model = world_network - self.k_l = nn.KLDivLoss(reduction='batchmean', log_target=True) + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) @property def _alpha(self) -> float: return self.log_alpha.exp() def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -100,30 +100,36 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: if weights is None: weights = torch.ones(rewards.shape) ################## Update the Critic First #################### with torch.no_grad(): - not_dones = (1 - dones) + not_dones = 1 - dones q_means = [] q_weights = [] - accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=30, dim=0) + accum_dist_rewards = torch.repeat_interleave( + rewards.unsqueeze(dim=0), repeats=30, dim=0 + ) # 5 * 5 * 4 = 100 for hori in range(self.horizon): _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) - mean_predictions, all_mean_next, _, _ = self.world_model.pred_next_states(next_states, curr_hori_action) - pred_rewards, _ = self.world_model.pred_all_rewards(observation=next_states, - action=curr_hori_action, - next_observation=all_mean_next) - pred_rewards *= (self.gamma ** (hori + 1)) + mean_predictions, all_mean_next, _, _ = ( + self.world_model.pred_next_states(next_states, curr_hori_action) + ) + pred_rewards, _ = self.world_model.pred_all_rewards( + observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next, + ) + pred_rewards *= self.gamma ** (hori + 1) accum_dist_rewards += pred_rewards # V = Q - alpha * logi pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) @@ -134,10 +140,22 @@ def _train_policy( pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi q_0 = [] for i in range(pred_rewards.shape[0]): - pred_tq1 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v1 - pred_tq2 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v2 - pred_tq3 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v3 - pred_tq4 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + pred_tq1 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + ) + pred_tq2 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + ) + pred_tq3 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + ) + pred_tq4 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + ) q_0.append(pred_tq1) q_0.append(pred_tq2) q_0.append(pred_tq3) @@ -181,7 +199,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -190,15 
+208,13 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -243,7 +259,7 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: rewards=rewards, next_states=next_states, dones=dones, - weights=torch.ones(rewards.shape) + weights=torch.ones(rewards.shape), ) def set_statistics(self, stats: dict) -> None: diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py index 8ed33fa2..5195d98e 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -22,23 +22,23 @@ class STEVESAC_Bounded: def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - horizon: int, - device: torch.device, - train_reward: bool, - train_both: bool, - gripper: bool, - threshold: float, - exploration_sample: int + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int, ): logging.info("------------------------------------------------") logging.info("----I am runing the STEVESAC_Bounded Agent! 
----") @@ -83,14 +83,14 @@ def __init__( # World model self.world_model = world_network - self.k_l = nn.KLDivLoss(reduction='batchmean', log_target=True) + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) @property def _alpha(self) -> float: return self.log_alpha.exp() def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 ) -> np.ndarray: # note that when evaluating this algorithm we need to select mu as self.actor_net.eval() @@ -102,18 +102,25 @@ def select_action_from_policy( (action, _, _) = self.actor_net(state_tensor) else: if self.set_stat: - multi_state_tensor = torch.repeat_interleave(state_tensor, self.exploration_sample, dim=0) - (multi_action, multi_log_pi, _) = self.actor_net(multi_state_tensor) + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) # Estimate uncertainty # [6, 10, 17] - _, _, nstate_means, nstate_vars = self.world_model.pred_next_states( - observation=multi_state_tensor, actions=multi_action) + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) # [10, 17] - aleatoric = torch.mean(nstate_vars ** 2, dim=0) ** 0.5 + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 epistemic = torch.var(nstate_means, dim=0) ** 0.5 aleatoric = torch.clamp(aleatoric, max=10e3) epistemic = torch.clamp(epistemic, max=10e3) - total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 uncert = torch.mean(total_unc, dim=1) world_dist = F.softmax(uncert, dim=0) # world_dist -= torch.min(world_dist) @@ -125,7 +132,9 @@ def select_action_from_policy( # multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist + final_dist = ( + 1 - self.threshold + ) * policy_dist + self.threshold * world_dist candi = torch.argmax(final_dist) # final_dist = F.softmax(final_dist, dim=0) # new_dist = torch.distributions.Categorical(final_dist) @@ -141,30 +150,36 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: if weights is None: weights = torch.ones(rewards.shape) ################## Update the Critic First #################### with torch.no_grad(): - not_dones = (1 - dones) + not_dones = 1 - dones q_means = [] q_weights = [] - accum_dist_rewards = torch.repeat_interleave(rewards.unsqueeze(dim=0), repeats=30, dim=0) + accum_dist_rewards = torch.repeat_interleave( + rewards.unsqueeze(dim=0), repeats=30, dim=0 + ) # 5 * 5 * 4 = 100 for hori in range(self.horizon): _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) - mean_predictions, all_mean_next, _, _ = self.world_model.pred_next_states(next_states, curr_hori_action) - pred_rewards, _ = self.world_model.pred_all_rewards(observation=next_states, - action=curr_hori_action, - next_observation=all_mean_next) - pred_rewards *= (self.gamma ** (hori + 1)) + mean_predictions, all_mean_next, _, _ = ( + 
self.world_model.pred_next_states(next_states, curr_hori_action) + ) + pred_rewards, _ = self.world_model.pred_all_rewards( + observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next, + ) + pred_rewards *= self.gamma ** (hori + 1) accum_dist_rewards += pred_rewards # V = Q - alpha * logi pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) @@ -175,10 +190,22 @@ def _train_policy( pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi q_0 = [] for i in range(pred_rewards.shape[0]): - pred_tq1 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v1 - pred_tq2 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v2 - pred_tq3 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v3 - pred_tq4 = accum_dist_rewards[i] + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + pred_tq1 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + ) + pred_tq2 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + ) + pred_tq3 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + ) + pred_tq4 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + ) q_0.append(pred_tq1) q_0.append(pred_tq2) q_0.append(pred_tq3) @@ -222,7 +249,7 @@ def _train_policy( # Update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() @@ -231,15 +258,13 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) ) - def train_world_model( - self, memory: MemoryBuffer, batch_size: int - ) -> None: + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: experiences = memory.sample_uniform(batch_size) states, actions, rewards, next_states, _, _ = experiences @@ -284,7 +309,7 @@ def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: rewards=rewards, next_states=next_states, dones=dones, - weights=torch.ones(rewards.shape) + weights=torch.ones(rewards.shape), ) def reward_function(self, curr_states, next_states): diff --git a/cares_reinforcement_learning/networks/SAC/triple_critic.py b/cares_reinforcement_learning/networks/SAC/triple_critic.py index c6347d6a..2250a269 100644 --- a/cares_reinforcement_learning/networks/SAC/triple_critic.py +++ b/cares_reinforcement_learning/networks/SAC/triple_critic.py @@ -34,7 +34,6 @@ def __init__(self, observation_size: int, num_actions: int): nn.Linear(self.hidden_size[1], 1), ) - def forward( self, state: torch.Tensor, action: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py b/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py index 29153282..69fd4fe7 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py @@ -1,2 +1,2 @@ from .world_ensemble_one_rwd import Ensemble_Dyna_One_Reward -from .world_ensemble_big import Ensemble_Dyna_Big \ No newline at end of file +from .world_ensemble_big import Ensemble_Dyna_Big 
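For reference (not part of the patch): the STEVESAC variants above build one candidate target per rollout depth and critic head (accumulated discounted model rewards plus a discounted soft value) and collect them in q_means / q_weights. The combination step is outside this excerpt; the sketch below shows the inverse-variance weighted mean used in the original STEVE formulation, which those lists appear to be building toward, and is an assumption rather than the patch's own code.

import torch

def steve_combine(per_depth_candidates: list[torch.Tensor]) -> torch.Tensor:
    # per_depth_candidates[h]: [num_candidates, batch, 1] targets for rollout depth h
    means = torch.stack([c.mean(dim=0) for c in per_depth_candidates])                   # [H, B, 1]
    inv_vars = torch.stack([1.0 / (c.var(dim=0) + 1e-8) for c in per_depth_candidates])  # [H, B, 1]
    weights = inv_vars / inv_vars.sum(dim=0, keepdim=True)                               # normalise over depths
    return (weights * means).sum(dim=0)                                                  # [B, 1]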
diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py index 8e07e48e..7c88f74a 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py @@ -4,10 +4,16 @@ import torch.nn.functional as F import torch.utils from torch import optim -from cares_reinforcement_learning.networks.world_models.simple import Probabilistic_Dynamics +from cares_reinforcement_learning.networks.world_models.simple import ( + Probabilistic_Dynamics, +) from cares_reinforcement_learning.networks.world_models import World_Model from cares_reinforcement_learning.util.helpers import normalize_observation_delta -from cares_reinforcement_learning.util import denormalize_observation_delta, normalize_observation +from cares_reinforcement_learning.util import ( + denormalize_observation_delta, + normalize_observation, +) + def sig(x): """ @@ -22,26 +28,30 @@ class Ensemble_Dyna_Big(World_Model): """ World Model """ - def __init__(self, - observation_size: int, - num_actions: int, - device: str, - l_r: float = 0.001, - hidden_size=None, - sas: bool = True, - prob_rwd: bool = True, - num_models: int = 7, - boost_inter: int = 3, - num_rwd_model: int = 1 - ): - super().__init__(observation_size=observation_size, - num_actions=num_actions, - l_r=l_r, - device=device, - hidden_size=hidden_size, - sas=sas, - prob_rwd=prob_rwd, - num_rwd_model=num_rwd_model) + + def __init__( + self, + observation_size: int, + num_actions: int, + device: str, + l_r: float = 0.001, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = True, + num_models: int = 7, + boost_inter: int = 3, + num_rwd_model: int = 1, + ): + super().__init__( + observation_size=observation_size, + num_actions=num_actions, + l_r=l_r, + device=device, + hidden_size=hidden_size, + sas=sas, + prob_rwd=prob_rwd, + num_rwd_model=num_rwd_model, + ) self.num_models = num_models self.observation_size = observation_size @@ -72,7 +82,10 @@ def __init__(self, ) self.world_models.append(model) - self.optimizers = [optim.Adam(self.world_models[i].parameters(), lr=l_r) for i in range(self.num_models)] + self.optimizers = [ + optim.Adam(self.world_models[i].parameters(), lr=l_r) + for i in range(self.num_models) + ] self.statistics = {} # Bring all reward prediction and dynamic rediction networks to device. 
self.device = device @@ -82,11 +95,11 @@ def __init__(self, self.update_counter = 0 def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor + self, observation: torch.Tensor, actions: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions ) norm_means = [] norm_vars = [] @@ -100,17 +113,19 @@ def pred_next_states( predictions_vars = torch.stack(norm_vars) predictions_norm_means = torch.stack(norm_means) # Normalized - predictions_means = denormalize_observation_delta(predictions_norm_means, self.statistics) + predictions_means = denormalize_observation_delta( + predictions_norm_means, self.statistics + ) all_predictions = predictions_means + observation denorm_avg = torch.mean(predictions_means, dim=0) prediction = denorm_avg + observation return prediction, all_predictions, predictions_norm_means, predictions_vars def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, ) -> None: # This boosting part is useless, cause inaccuracy. # weights = 1.5 - sig(self.curr_losses) @@ -118,8 +133,8 @@ def train_world( assert len(states.shape) >= 2 assert len(actions.shape) == 2 assert ( - states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size ) # min_ = np.min(self.curr_losses) # max_ = np.max(self.curr_losses) @@ -134,7 +149,9 @@ def train_world( delta_targets_normalized = normalize_observation_delta(target, self.statistics) normalized_state = normalize_observation(states, self.statistics) n_mean, n_var = self.world_models[index].forward(normalized_state, actions) - model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + model_loss = F.gaussian_nll_loss( + input=n_mean, target=delta_targets_normalized, var=n_var + ).mean() self.optimizers[index].zero_grad() model_loss.backward() self.optimizers[index].step() @@ -142,10 +159,8 @@ def train_world( self.update_counter += 1 self.update_counter %= self.boost_inter * self.num_models - - def estimate_uncertainty( - self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool + self, observation: torch.Tensor, actions: torch.Tensor, train_reward: bool ) -> tuple[float, float, torch.Tensor]: """ Estimate uncertainty. 
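For reference (not part of the patch): the estimate_uncertainty hunks that follow decompose ensemble uncertainty into an aleatoric term (the noise the models themselves predict) and an epistemic term (disagreement between models), clamp each, and combine them in quadrature. Distilled into a standalone sketch:

import numpy as np

def total_uncertainty(pred_means: np.ndarray, pred_vars: np.ndarray) -> float:
    # pred_means, pred_vars: [num_models, obs_dim], as produced by the ensemble
    aleatoric = np.minimum(np.sqrt((pred_vars ** 2).mean(axis=0)), 10e3)  # model-predicted noise
    epistemic = np.minimum(pred_means.std(axis=0), 10e3)                  # disagreement across models
    return float(np.mean(np.sqrt(aleatoric ** 2 + epistemic ** 2)))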
@@ -164,13 +179,13 @@ def estimate_uncertainty( vars_s.append(var) vars_s = torch.stack(vars_s).squeeze() noises = vars_s.cpu().detach().numpy() - aleatoric = (noises ** 2).mean(axis=0) ** 0.5 + aleatoric = (noises**2).mean(axis=0) ** 0.5 all_means = torch.stack(means).squeeze() epistemic = all_means.cpu().detach().numpy() epistemic = epistemic.var(axis=0) ** 0.5 aleatoric = np.minimum(aleatoric, 10e3) epistemic = np.minimum(epistemic, 10e3) - total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 uncert = np.mean(total_unc) if train_reward: # Reward Uncertainty @@ -178,21 +193,29 @@ def estimate_uncertainty( means = torch.vstack(means) dist = torch.distributions.Normal(means, vars_s) samples = dist.sample([sample_times]) - samples = torch.reshape(samples, (sample_times * self.num_models, self.observation_size)) + samples = torch.reshape( + samples, (sample_times * self.num_models, self.observation_size) + ) samples = denormalize_observation_delta(samples, self.statistics) - observationss = torch.repeat_interleave(observation, repeats=sample_times * self.num_models, dim=0) - actionss = torch.repeat_interleave(actions, repeats=sample_times * self.num_models, dim=0) + observationss = torch.repeat_interleave( + observation, repeats=sample_times * self.num_models, dim=0 + ) + actionss = torch.repeat_interleave( + actions, repeats=sample_times * self.num_models, dim=0 + ) samples += observationss if self.sas: if self.prob_rwd: - rewards, rwd_var = self.reward_network(observationss, actionss, samples) + rewards, rwd_var = self.reward_network( + observationss, actionss, samples + ) epis_uncert = torch.var(rewards, dim=0).item() rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() alea_uncert = rwd_var epis_uncert = np.minimum(epis_uncert, 10e3) alea_uncert = np.minimum(alea_uncert, 10e3) - uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 else: rewards = self.reward_network(observationss, actionss, samples) uncert_rwd = torch.var(rewards, dim=0).item() @@ -204,19 +227,25 @@ def estimate_uncertainty( alea_uncert = rwd_var epis_uncert = np.minimum(epis_uncert, 10e3) alea_uncert = np.minimum(alea_uncert, 10e3) - uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 else: rewards = self.reward_network(samples, actionss) uncert_rwd = torch.var(rewards, dim=0).item() else: dist = torch.distributions.Normal(all_means, vars_s) next_state_samples = dist.sample([20]) - next_state_samples = next_state_samples.reshape((self.num_models * 20, self.observation_size)) - next_state_samples = denormalize_observation_delta(next_state_samples, self.statistics) + next_state_samples = next_state_samples.reshape( + (self.num_models * 20, self.observation_size) + ) + next_state_samples = denormalize_observation_delta( + next_state_samples, self.statistics + ) next_state_samples += observation return uncert, uncert_rwd, next_state_samples - def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor): + def train_together( + self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor + ): sample_times = 20 normalized_state = normalize_observation(states, self.statistics) mean_s = [] @@ -239,15 +268,27 @@ def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: t rwd_s = torch.vstack(rwd_s) dist = torch.distributions.Normal(mean_s, var_s) - samples = 
(dist.sample([sample_times])) + samples = dist.sample([sample_times]) - actions = torch.repeat_interleave(act_s.unsqueeze(dim=0), repeats=sample_times, dim=0) - states = torch.repeat_interleave(state_s.unsqueeze(dim=0), repeats=sample_times,dim=0) - rwd_s = torch.repeat_interleave(rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0) + actions = torch.repeat_interleave( + act_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + states = torch.repeat_interleave( + state_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + rwd_s = torch.repeat_interleave( + rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) - samples = torch.reshape(samples, (samples.shape[0] * samples.shape[1], self.observation_size)) - states = torch.reshape(states, (states.shape[0] * states.shape[1], states.shape[2])) - actions = torch.reshape(actions, (actions.shape[0] * actions.shape[1], actions.shape[2])) + samples = torch.reshape( + samples, (samples.shape[0] * samples.shape[1], self.observation_size) + ) + states = torch.reshape( + states, (states.shape[0] * states.shape[1], states.shape[2]) + ) + actions = torch.reshape( + actions, (actions.shape[0] * actions.shape[1], actions.shape[2]) + ) rwd_s = torch.reshape(rwd_s, (rwd_s.shape[0] * rwd_s.shape[1], rwd_s.shape[2])) samples = denormalize_observation_delta(samples, self.statistics) diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py index 6affe44b..334b8419 100644 --- a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py @@ -4,10 +4,16 @@ import torch.nn.functional as F import torch.utils from torch import optim -from cares_reinforcement_learning.networks.world_models.simple import Probabilistic_Dynamics +from cares_reinforcement_learning.networks.world_models.simple import ( + Probabilistic_Dynamics, +) from cares_reinforcement_learning.networks.world_models import World_Model from cares_reinforcement_learning.util.helpers import normalize_observation_delta -from cares_reinforcement_learning.util import denormalize_observation_delta, normalize_observation +from cares_reinforcement_learning.util import ( + denormalize_observation_delta, + normalize_observation, +) + def sig(x): """ @@ -22,17 +28,22 @@ class Ensemble_Dyna_One_Reward(World_Model): """ World Model """ - def __init__(self, - observation_size: int, - num_actions: int, - device: str, - num_models: int = 5, - l_r: float = 0.001, - boost_inter: int = 3, - hidden_size=None, - sas: bool = True, - prob_rwd: bool = True): - super().__init__(observation_size, num_actions, l_r, device, hidden_size, sas, prob_rwd) + + def __init__( + self, + observation_size: int, + num_actions: int, + device: str, + num_models: int = 5, + l_r: float = 0.001, + boost_inter: int = 3, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = True, + ): + super().__init__( + observation_size, num_actions, l_r, device, hidden_size, sas, prob_rwd + ) if hidden_size is None: hidden_size = [128, 128] self.num_models = num_models @@ -48,7 +59,10 @@ def __init__(self, ) for _ in range(self.num_models) ] - self.optimizers = [optim.Adam(self.world_models[i].parameters(), lr=l_r) for i in range(self.num_models)] + self.optimizers = [ + optim.Adam(self.world_models[i].parameters(), lr=l_r) + for i in range(self.num_models) + ] self.statistics = {} # Bring all reward prediction 
and dynamic rediction networks to device. self.device = device @@ -58,11 +72,11 @@ def __init__(self, self.update_counter = 0 def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor + self, observation: torch.Tensor, actions: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions ) norm_means = [] norm_vars = [] @@ -76,17 +90,19 @@ def pred_next_states( predictions_vars = torch.stack(norm_vars) predictions_norm_means = torch.stack(norm_means) # Normalized - predictions_means = denormalize_observation_delta(predictions_norm_means, self.statistics) + predictions_means = denormalize_observation_delta( + predictions_norm_means, self.statistics + ) all_predictions = predictions_means + observation denorm_avg = torch.mean(predictions_means, dim=0) prediction = denorm_avg + observation return prediction, all_predictions, predictions_norm_means, predictions_vars def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, ) -> None: # This boosting part is useless, cause inaccuracy. # weights = 1.5 - sig(self.curr_losses) @@ -94,8 +110,8 @@ def train_world( assert len(states.shape) >= 2 assert len(actions.shape) == 2 assert ( - states.shape[1] + actions.shape[1] - == self.num_actions + self.observation_size + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size ) # min_ = np.min(self.curr_losses) # max_ = np.max(self.curr_losses) @@ -110,7 +126,9 @@ def train_world( delta_targets_normalized = normalize_observation_delta(target, self.statistics) normalized_state = normalize_observation(states, self.statistics) n_mean, n_var = self.world_models[index].forward(normalized_state, actions) - model_loss = F.gaussian_nll_loss(input=n_mean, target=delta_targets_normalized, var=n_var).mean() + model_loss = F.gaussian_nll_loss( + input=n_mean, target=delta_targets_normalized, var=n_var + ).mean() self.optimizers[index].zero_grad() model_loss.backward() self.optimizers[index].step() @@ -119,7 +137,7 @@ def train_world( self.update_counter %= self.boost_inter * self.num_models def estimate_uncertainty( - self, observation: torch.Tensor, actions: torch.Tensor, train_reward:bool + self, observation: torch.Tensor, actions: torch.Tensor, train_reward: bool ) -> tuple[float, float, torch.Tensor]: """ Estimate uncertainty. 
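A side note (not part of the patch): in both ensemble classes the update counter in train_world wraps at boost_inter * num_models, which is consistent with a round-robin schedule in which each member is stepped on its own stream of minibatches for boost_inter consecutive calls. The index selection itself is outside this excerpt; one schedule matching the counter arithmetic, stated as an assumption:

def model_index(update_counter: int, boost_inter: int, num_models: int) -> int:
    # Train the same ensemble member for boost_inter consecutive calls, then move on.
    return (update_counter // boost_inter) % num_models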
@@ -138,13 +156,13 @@ def estimate_uncertainty( vars_s.append(var) vars_s = torch.stack(vars_s).squeeze() noises = vars_s.cpu().detach().numpy() - aleatoric = (noises ** 2).mean(axis=0) ** 0.5 + aleatoric = (noises**2).mean(axis=0) ** 0.5 all_means = torch.stack(means).squeeze() epistemic = all_means.cpu().detach().numpy() epistemic = epistemic.var(axis=0) ** 0.5 aleatoric = np.minimum(aleatoric, 10e3) epistemic = np.minimum(epistemic, 10e3) - total_unc = (aleatoric ** 2 + epistemic ** 2) ** 0.5 + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 uncert = np.mean(total_unc) if train_reward: # Reward Uncertainty @@ -152,21 +170,29 @@ def estimate_uncertainty( means = torch.vstack(means) dist = torch.distributions.Normal(means, vars_s) samples = dist.sample([sample_times]) - samples = torch.reshape(samples, (sample_times * self.num_models, self.observation_size)) + samples = torch.reshape( + samples, (sample_times * self.num_models, self.observation_size) + ) samples = denormalize_observation_delta(samples, self.statistics) - observationss = torch.repeat_interleave(observation, repeats=sample_times * self.num_models, dim=0) - actionss = torch.repeat_interleave(actions, repeats=sample_times * self.num_models, dim=0) + observationss = torch.repeat_interleave( + observation, repeats=sample_times * self.num_models, dim=0 + ) + actionss = torch.repeat_interleave( + actions, repeats=sample_times * self.num_models, dim=0 + ) samples += observationss if self.sas: if self.prob_rwd: - rewards, rwd_var = self.reward_network(observationss, actionss, samples) + rewards, rwd_var = self.reward_network( + observationss, actionss, samples + ) epis_uncert = torch.var(rewards, dim=0).item() rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() alea_uncert = rwd_var epis_uncert = np.minimum(epis_uncert, 10e3) alea_uncert = np.minimum(alea_uncert, 10e3) - uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 else: rewards = self.reward_network(observationss, actionss, samples) uncert_rwd = torch.var(rewards, dim=0).item() @@ -178,19 +204,25 @@ def estimate_uncertainty( alea_uncert = rwd_var epis_uncert = np.minimum(epis_uncert, 10e3) alea_uncert = np.minimum(alea_uncert, 10e3) - uncert_rwd = ((epis_uncert ** 2) + (alea_uncert ** 2)) ** 0.5 + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 else: rewards = self.reward_network(samples, actionss) uncert_rwd = torch.var(rewards, dim=0).item() else: dist = torch.distributions.Normal(all_means, vars_s) next_state_samples = dist.sample([20]) - next_state_samples = next_state_samples.reshape((self.num_models * 20, self.observation_size)) - next_state_samples = denormalize_observation_delta(next_state_samples, self.statistics) + next_state_samples = next_state_samples.reshape( + (self.num_models * 20, self.observation_size) + ) + next_state_samples = denormalize_observation_delta( + next_state_samples, self.statistics + ) next_state_samples += observation return uncert, uncert_rwd, next_state_samples - def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor): + def train_together( + self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor + ): sample_times = 20 normalized_state = normalize_observation(states, self.statistics) mean_s = [] @@ -213,21 +245,32 @@ def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: t rwd_s = torch.vstack(rwd_s) dist = torch.distributions.Normal(mean_s, var_s) - samples = 
(dist.sample([sample_times])) + samples = dist.sample([sample_times]) - actions = torch.repeat_interleave(act_s.unsqueeze(dim=0), repeats=sample_times, dim=0) - states = torch.repeat_interleave(state_s.unsqueeze(dim=0), repeats=sample_times,dim=0) - rwd_s = torch.repeat_interleave(rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0) + actions = torch.repeat_interleave( + act_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + states = torch.repeat_interleave( + state_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + rwd_s = torch.repeat_interleave( + rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) - samples = torch.reshape(samples, (samples.shape[0] * samples.shape[1], self.observation_size)) - states = torch.reshape(states, (states.shape[0] * states.shape[1], states.shape[2])) - actions = torch.reshape(actions, (actions.shape[0] * actions.shape[1], actions.shape[2])) + samples = torch.reshape( + samples, (samples.shape[0] * samples.shape[1], self.observation_size) + ) + states = torch.reshape( + states, (states.shape[0] * states.shape[1], states.shape[2]) + ) + actions = torch.reshape( + actions, (actions.shape[0] * actions.shape[1], actions.shape[2]) + ) rwd_s = torch.reshape(rwd_s, (rwd_s.shape[0] * rwd_s.shape[1], rwd_s.shape[2])) samples = denormalize_observation_delta(samples, self.statistics) samples += states - if self.prob_rwd: if self.sas: rwd_mean, rwd_var = self.reward_network(states, actions, samples) diff --git a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py index 682423a2..fb9b7477 100644 --- a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py @@ -27,18 +27,20 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: list): self.observation_size = observation_size self.num_actions = num_actions - self.model = MLP(input_size=observation_size + num_actions, - hidden_sizes=hidden_size, - output_size=2 * observation_size) + self.model = MLP( + input_size=observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=2 * observation_size, + ) - self.add_module('mlp', self.model) + self.add_module("mlp", self.model) self.model.apply(weight_init_pnn) self.statistics = {} def forward( - self, observation: torch.Tensor, actions: torch.Tensor + self, observation: torch.Tensor, actions: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward the inputs throught the network. @@ -54,14 +56,14 @@ def forward( uncertainty estimation. 
""" assert ( - observation.shape[1] + actions.shape[1] - == self.observation_size + self.num_actions + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions ) # Always normalized obs x = torch.cat((observation, actions), dim=1) pred = self.model(x) - logvar = pred[:, :self.observation_size] - normalized_mean = pred[:, self.observation_size:] + logvar = pred[:, : self.observation_size] + normalized_mean = pred[:, self.observation_size :] logvar = torch.tanh(logvar) normalized_var = torch.exp(logvar) # Always denormalized delta diff --git a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py index f9dbe781..010bfbef 100644 --- a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py @@ -5,7 +5,13 @@ class Probabilistic_NS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize:bool): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. It takes current states (s) @@ -16,12 +22,13 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: list, n self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.model = MLP(input_size=observation_size, hidden_sizes=hidden_size, output_size=2) - self.add_module('mlp', self.model) + self.model = MLP( + input_size=observation_size, hidden_sizes=hidden_size, output_size=2 + ) + self.add_module("mlp", self.model) self.model.apply(weight_init) - def forward( - self, - next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: + + def forward(self, next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. diff --git a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py index af6fa196..122be07c 100644 --- a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py @@ -5,7 +5,13 @@ class Probabilistic_SAS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize: bool): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. 
It takes current states (s) @@ -21,15 +27,21 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: list, n self.observation_size = observation_size self.num_actions = num_actions - self.model = MLP(input_size=2 * observation_size + num_actions, - hidden_sizes=hidden_size, - output_size=2) + self.model = MLP( + input_size=2 * observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=2, + ) - self.add_module('mlp', self.model) + self.add_module("mlp", self.model) self.model.apply(weight_init) def forward( - self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: + self, + observation: torch.Tensor, + actions: torch.Tensor, + next_observation: torch.Tensor, + ) -> tuple[Tensor, Tensor]: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. diff --git a/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py b/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py index 776ca2c7..52a30b7a 100644 --- a/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py @@ -5,7 +5,13 @@ class Simple_NS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize:bool): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. It takes current states (s) @@ -20,12 +26,13 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: list, n self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.model = MLP(input_size=observation_size, hidden_sizes=hidden_size, output_size=1) - self.add_module('mlp', self.model) + self.model = MLP( + input_size=observation_size, hidden_sizes=hidden_size, output_size=1 + ) + self.add_module("mlp", self.model) self.model.apply(weight_init) - def forward( - self, observation: torch.Tensor) -> torch.Tensor: + def forward(self, observation: torch.Tensor) -> torch.Tensor: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. diff --git a/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py index c3348c1e..2df6e8b3 100644 --- a/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py +++ b/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py @@ -5,7 +5,13 @@ class Simple_SAS_Reward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: list, normalize: bool): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. 
It takes current states (s) @@ -20,12 +26,20 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: list, n self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.model = MLP(input_size=2 * observation_size + num_actions, hidden_sizes=hidden_size, output_size=1) - self.add_module('mlp', self.model) + self.model = MLP( + input_size=2 * observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=1, + ) + self.add_module("mlp", self.model) self.model.apply(weight_init) def forward( - self, observation: torch.Tensor, actions: torch.Tensor, next_observation: torch.Tensor) -> torch.Tensor: + self, + observation: torch.Tensor, + actions: torch.Tensor, + next_observation: torch.Tensor, + ) -> torch.Tensor: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. diff --git a/cares_reinforcement_learning/networks/world_models/world_model.py b/cares_reinforcement_learning/networks/world_models/world_model.py index 865379a8..3246a824 100644 --- a/cares_reinforcement_learning/networks/world_models/world_model.py +++ b/cares_reinforcement_learning/networks/world_models/world_model.py @@ -1,8 +1,14 @@ import logging import torch import numpy as np -from cares_reinforcement_learning.networks.world_models.simple import Probabilistic_SAS_Reward, Probabilistic_NS_Reward -from cares_reinforcement_learning.networks.world_models.simple import Simple_SAS_Reward, Simple_NS_Reward +from cares_reinforcement_learning.networks.world_models.simple import ( + Probabilistic_SAS_Reward, + Probabilistic_NS_Reward, +) +from cares_reinforcement_learning.networks.world_models.simple import ( + Simple_SAS_Reward, + Simple_NS_Reward, +) import torch.nn.functional as F import torch.utils from torch import optim @@ -14,15 +20,15 @@ class World_Model: """ def __init__( - self, - observation_size: int, - num_actions: int, - l_r: float, - device: str, - hidden_size=None, - sas: bool = True, - prob_rwd: bool = False, - num_rwd_model: int = 5 + self, + observation_size: int, + num_actions: int, + l_r: float, + device: str, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = False, + num_rwd_model: int = 5, ): logging.info(f"Num of Reward models: {num_rwd_model}") if hidden_size is None: @@ -46,14 +52,14 @@ def __init__( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, - normalize=False + normalize=False, ) else: reward_network = Probabilistic_NS_Reward( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, - normalize=False + normalize=False, ) else: if sas: @@ -61,14 +67,14 @@ def __init__( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, - normalize=False + normalize=False, ) else: reward_network = Simple_NS_Reward( observation_size=observation_size, num_actions=num_actions, hidden_size=hidden_size, - normalize=False + normalize=False, ) reward_network.to(self.device) self.rwd_models.append(reward_network) @@ -88,10 +94,10 @@ def set_statistics(self, statistics: dict) -> None: self.statistics = statistics def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, ) -> None: """ Train the dynamic of world model. 
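For reference (not part of the patch): the reward heads in this file and the ensemble dynamics above share one training pattern: a probabilistic head outputs a mean and a positive variance, and is fitted by minimising the Gaussian negative log-likelihood. A minimal sketch of that step with a placeholder model and optimiser:

import torch.nn.functional as F

def gaussian_nll_step(model, optimizer, inputs, targets):
    mean, var = model(*inputs)                                             # var must be > 0
    loss = F.gaussian_nll_loss(input=mean, target=targets, var=var).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()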
@@ -102,7 +108,7 @@ def train_world( logging.info(" Train world Not Implemented") def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor + self, observation: torch.Tensor, actions: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Make a prediction of next state. @@ -111,14 +117,18 @@ def pred_next_states( :return: Next_state Prediction, Next_state Means, Next_State Variance. """ logging.info("Predict Next Not Implemented") - return torch.zeros(observation.shape), torch.zeros(observation.shape), torch.zeros(observation.shape) + return ( + torch.zeros(observation.shape), + torch.zeros(observation.shape), + torch.zeros(observation.shape), + ) def train_reward( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - rewards: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + rewards: torch.Tensor, ) -> None: """ Train the reward prediction with or without world model dynamics. @@ -132,10 +142,14 @@ def train_reward( self.rwd_model_optimizers[indice].zero_grad() if self.prob_rwd: if self.sas: - rwd_mean, rwd_var = self.rwd_models[indice](states, actions, next_states) + rwd_mean, rwd_var = self.rwd_models[indice]( + states, actions, next_states + ) else: rwd_mean, rwd_var = self.rwd_models[indice](next_states) - reward_loss = F.gaussian_nll_loss(input=rwd_mean, target=rewards, var=rwd_var) + reward_loss = F.gaussian_nll_loss( + input=rwd_mean, target=rewards, var=rwd_var + ) else: if self.sas: rwd_mean = self.rwd_models[indice](states, actions, next_states) @@ -145,8 +159,12 @@ def train_reward( reward_loss.backward() self.rwd_model_optimizers[indice].step() - def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor]: + def pred_rewards( + self, + observation: torch.Tensor, + action: torch.Tensor, + next_observation: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: """ Predict reward based on SAS :param observation: @@ -159,12 +177,16 @@ def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_obs for i in range(self.num_rwd_model): if self.prob_rwd: if self.sas: - pred_rewards, rwd_var = self.rwd_models[i](observation, action, next_observation) + pred_rewards, rwd_var = self.rwd_models[i]( + observation, action, next_observation + ) else: pred_rewards, rwd_var = self.rwd_models[i](next_observation) else: if self.sas: - pred_rewards = self.rwd_models[i](observation, action, next_observation) + pred_rewards = self.rwd_models[i]( + observation, action, next_observation + ) else: pred_rewards = self.rwd_models[i](next_observation) rwd_var = None @@ -179,8 +201,8 @@ def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_obs rwd_var = torch.zeros(preds.shape) else: rwd_var = torch.stack(preds_vars) - aleatoric_uncert = torch.mean(rwd_var ** 2, dim=0) ** 0.5 - total_unc = (aleatoric_uncert ** 2 + epistemic_uncert ** 2) ** 0.5 + aleatoric_uncert = torch.mean(rwd_var**2, dim=0) ** 0.5 + total_unc = (aleatoric_uncert**2 + epistemic_uncert**2) ** 0.5 if preds.shape[0] > 1: preds = torch.mean(preds, dim=0) @@ -189,8 +211,12 @@ def pred_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_obs return preds, total_unc - def pred_all_rewards(self, observation: torch.Tensor, action: torch.Tensor, next_observation: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor]: + def pred_all_rewards( + self, + observation: 
torch.Tensor, + action: torch.Tensor, + next_observation: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: """ Predict reward based on SAS :param observation: @@ -204,12 +230,16 @@ def pred_all_rewards(self, observation: torch.Tensor, action: torch.Tensor, next for i in range(self.num_rwd_model): if self.prob_rwd: if self.sas: - pred_rewards, rwd_var = self.rwd_models[i](observation, action, next_observation[j]) + pred_rewards, rwd_var = self.rwd_models[i]( + observation, action, next_observation[j] + ) else: pred_rewards, rwd_var = self.rwd_models[i](next_observation[j]) else: if self.sas: - pred_rewards = self.rwd_models[i](observation, action, next_observation[j]) + pred_rewards = self.rwd_models[i]( + observation, action, next_observation[j] + ) else: pred_rewards = self.rwd_models[i](next_observation[j]) rwd_var = None @@ -224,7 +254,10 @@ def pred_all_rewards(self, observation: torch.Tensor, action: torch.Tensor, next return preds, preds_vars def estimate_uncertainty( - self, observation: torch.Tensor, actions: torch.Tensor, train_reward: bool, + self, + observation: torch.Tensor, + actions: torch.Tensor, + train_reward: bool, ) -> tuple[float, float, torch.Tensor]: """ Estimate next state uncertainty and reward uncertainty. @@ -236,5 +269,10 @@ def estimate_uncertainty( logging.info("Estimating Uncertainty Not Implemented") return 0.0, 0.0, None - def train_together(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor, ): + def train_together( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + ): logging.info("Train Together Not Implemented") diff --git a/cares_reinforcement_learning/util/__init__.py b/cares_reinforcement_learning/util/__init__.py index 8dbd1fcd..6a13b055 100644 --- a/cares_reinforcement_learning/util/__init__.py +++ b/cares_reinforcement_learning/util/__init__.py @@ -2,4 +2,4 @@ from cares_reinforcement_learning.util.record import Record from cares_reinforcement_learning.util.rl_parser import RLParser from cares_reinforcement_learning.util.helpers import * -from cares_reinforcement_learning.util.uncertainty_estimation import * \ No newline at end of file +from cares_reinforcement_learning.util.uncertainty_estimation import * diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 5db5515f..ed384938 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -156,8 +156,8 @@ class SACConfig(AlgorithmConfig): class DynaSAC_NSConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_NS", Literal=True) type: str = Field("mbrl", Literal=True) - G: int = 1, - G_model: float = 1, + G: int = (1,) + G_model: float = (1,) actor_lr: float = 3e-4 critic_lr: float = 3e-4 @@ -185,8 +185,8 @@ class DynaSAC_NSConfig(AlgorithmConfig): class STEVESACConfig(AlgorithmConfig): algorithm: str = Field("STEVESAC", Literal=True) type: str = Field("mbrl", Literal=True) - G: int = 1, - G_model: float = 1, + G: int = (1,) + G_model: float = (1,) actor_lr: float = 3e-4 critic_lr: float = 3e-4 @@ -217,8 +217,8 @@ class STEVESACConfig(AlgorithmConfig): class STEVESAC_BoundedConfig(AlgorithmConfig): algorithm: str = Field("STEVESAC_Bounded", Literal=True) type: str = Field("mbrl", Literal=True) - G: int = 1, - G_model: float = 1, + G: int = (1,) + G_model: float = (1,) actor_lr: float = 3e-4 critic_lr: float = 3e-4 @@ -252,8 +252,8 @@ class STEVESAC_BoundedConfig(AlgorithmConfig): class 
DynaSAC_BoundedConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_Bounded", Literal=True) type: str = Field("mbrl", Literal=True) - G: int = 1, - G_model: float = 1, + G: int = (1,) + G_model: float = (1,) actor_lr: float = 3e-4 critic_lr: float = 3e-4 diff --git a/cares_reinforcement_learning/util/helpers.py b/cares_reinforcement_learning/util/helpers.py index 859edf72..c49f2917 100644 --- a/cares_reinforcement_learning/util/helpers.py +++ b/cares_reinforcement_learning/util/helpers.py @@ -8,6 +8,7 @@ import torch.nn.functional as F import time + class MLP(nn.Module): def __init__(self, input_size: int, hidden_sizes: list[int], output_size: int): super().__init__() diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 8e1acaf1..8d65d069 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -13,6 +13,7 @@ # DQN Algorithms # ################################### + def create_SAC(observation_size, action_num, config: acf.SACConfig): from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic @@ -37,7 +38,9 @@ def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig """ from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) @@ -49,7 +52,7 @@ def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig num_actions=action_num, num_models=config.num_models, device=device, - sas=config.sas + sas=config.sas, ) agent = DynaSAC_NS( @@ -67,19 +70,23 @@ def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig device=device, train_both=config.train_both, train_reward=config.train_reward, - gripper=config.gripper + gripper=config.gripper, ) return agent -def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_BoundedConfig): +def create_DynaSAC_Bounded( + observation_size, action_num, config: acf.DynaSAC_BoundedConfig +): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Bounded from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) @@ -94,7 +101,7 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou l_r=config.world_model_lr, sas=config.sas, prob_rwd=True, - boost_inter=30 + boost_inter=30, ) agent = DynaSAC_Bounded( @@ -114,7 +121,7 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou train_reward=config.train_reward, gripper=config.gripper, threshold=config.threshold, - exploration_sample=config.exploration_sample + exploration_sample=config.exploration_sample, ) return agent @@ -127,7 +134,9 @@ def create_STEVESAC(observation_size, action_num, config: acf.STEVESACConfig): """ from cares_reinforcement_learning.algorithm.mbrl import STEVESAC from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) @@ -141,7 +150,7 @@ def create_STEVESAC(observation_size, action_num, config: acf.STEVESACConfig): num_rwd_model=config.num_rwd_models, device=device, l_r=config.world_model_lr, - sas=config.sas + sas=config.sas, ) agent = STEVESAC( @@ -158,12 +167,14 @@ def create_STEVESAC(observation_size, action_num, config: acf.STEVESACConfig): device=device, train_both=config.train_both, train_reward=config.train_reward, - gripper=config.gripper + gripper=config.gripper, ) return agent -def create_STEVESAC_Bounded(observation_size, action_num, config: acf.STEVESAC_BoundedConfig): +def create_STEVESAC_Bounded( + observation_size, action_num, config: acf.STEVESAC_BoundedConfig +): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
@@ -171,19 +182,24 @@ def create_STEVESAC_Bounded(observation_size, action_num, config: acf.STEVESAC_B from cares_reinforcement_learning.algorithm.mbrl import STEVESAC_Bounded from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - world_model = Ensemble_Dyna_Big(observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - num_rwd_model=config.num_rwd_models, - device=device, - l_r=config.world_model_lr, sas=config.sas) + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, + device=device, + l_r=config.world_model_lr, + sas=config.sas, + ) agent = STEVESAC_Bounded( actor_network=actor, @@ -201,7 +217,7 @@ def create_STEVESAC_Bounded(observation_size, action_num, config: acf.STEVESAC_B train_reward=config.train_reward, gripper=config.gripper, threshold=config.threshold, - exploration_sample=config.exploration_sample + exploration_sample=config.exploration_sample, ) return agent @@ -429,10 +445,10 @@ def create_STEVESAC_Bounded(observation_size, action_num, config: acf.STEVESAC_B class NetworkFactory: def create_network( - self, - observation_size, - action_num: int, - config: acf.AlgorithmConfig, + self, + observation_size, + action_num: int, + config: acf.AlgorithmConfig, ): algorithm = config.algorithm diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py index ed9b0933..d1c4010c 100644 --- a/cares_reinforcement_learning/util/uncertainty_estimation.py +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -11,16 +11,11 @@ def sampling(pred_means, pred_vars): :return: """ # 5 models, each sampled 10 times = 50, - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [10]) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [10]) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [10]) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [10]) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [10]) + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample([10]) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample([10]) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample([10]) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample([10]) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample([10]) samples = torch.cat((sample1, sample2, sample3, sample4, sample5)) # Samples = [5 * 10, 10 predictions, 11 state dims] @@ -38,4 +33,4 @@ def sampling(pred_means, pred_vars): # total_stds = total_stds / torch.mean(total_stds) # if very uncertain, # high std, encouraged. 
# total_stds = total_stds - torch.min(total_stds) - return total_stds.detach() \ No newline at end of file + return total_stds.detach() From e9837326a5d2861ea59c88952e1652928cbe169c Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 30 Dec 2024 12:39:31 +1300 Subject: [PATCH 76/91] merge --- .../util/configurations.py | 8 +++---- .../util/network_factory.py | 21 +++++++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 5db5515f..41cdb258 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -178,7 +178,7 @@ class DynaSAC_NSConfig(AlgorithmConfig): num_samples: int = 10 sas: bool = False train_reward: bool = True - train_both: bool = True + train_both: bool = False gripper: bool = False @@ -210,7 +210,7 @@ class STEVESACConfig(AlgorithmConfig): sas: bool = False train_reward: bool = True - train_both: bool = True + train_both: bool = False gripper: bool = False @@ -242,7 +242,7 @@ class STEVESAC_BoundedConfig(AlgorithmConfig): sas: bool = False train_reward: bool = True - train_both: bool = True + train_both: bool = False gripper: bool = False threshold: float = 0.1 @@ -274,7 +274,7 @@ class DynaSAC_BoundedConfig(AlgorithmConfig): num_samples: int = 10 sas: bool = False train_reward: bool = True - train_both: bool = True + train_both: bool = False gripper: bool = False threshold: float = 0.1 exploration_sample: int = 5 diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 8e1acaf1..84bb1e25 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -49,7 +49,9 @@ def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig num_actions=action_num, num_models=config.num_models, device=device, - sas=config.sas + l_r=config.world_model_lr, + sas=config.sas, + boost_inter=30 ) agent = DynaSAC_NS( @@ -86,16 +88,13 @@ def create_DynaSAC_Bounded(observation_size, action_num, config: acf.DynaSAC_Bou device = hlp.get_device() - world_model = Ensemble_Dyna_Big( - observation_size=observation_size, - num_actions=action_num, - num_models=config.num_models, - device=device, - l_r=config.world_model_lr, - sas=config.sas, - prob_rwd=True, - boost_inter=30 - ) + world_model = Ensemble_Dyna_Big(observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + l_r=config.world_model_lr, + sas=config.sas, + boost_inter=30) agent = DynaSAC_Bounded( actor_network=actor, From 051d3f086e941ec24cce51dfa3fc4e38160f53b5 Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 30 Dec 2024 12:44:16 +1300 Subject: [PATCH 77/91] merge --- cares_reinforcement_learning/util/network_factory.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 43002561..3c1b9e77 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -54,8 +54,7 @@ def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig device=device, l_r=config.world_model_lr, sas=config.sas, - boost_inter=30 - sas=config.sas, + boost_inter=30, ) agent = DynaSAC_NS( From 7884942c97a9781bd3dc3ba87464541ba9162121 Mon Sep 17 00:00:00 2001 From: "Formatter 
[BOT]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 29 Dec 2024 23:44:35 +0000 Subject: [PATCH 78/91] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cares_reinforcement_learning/util/network_factory.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 3c1b9e77..804d4f66 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -184,7 +184,9 @@ def create_STEVESAC_Bounded( from cares_reinforcement_learning.algorithm.mbrl import STEVESAC_Bounded from cares_reinforcement_learning.networks.SAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models.ensemble import Ensemble_Dyna_Big + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) @@ -445,10 +447,10 @@ def create_STEVESAC_Bounded( class NetworkFactory: def create_network( - self, - observation_size, - action_num: int, - config: acf.AlgorithmConfig, + self, + observation_size, + action_num: int, + config: acf.AlgorithmConfig, ): algorithm = config.algorithm From 3e2127bef63d94dc39a6d0ced681ed73a31cfaa5 Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 30 Dec 2024 13:03:46 +1300 Subject: [PATCH 79/91] Fix reward learning --- .../networks/world_models/world_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cares_reinforcement_learning/networks/world_models/world_model.py b/cares_reinforcement_learning/networks/world_models/world_model.py index 3246a824..6077f198 100644 --- a/cares_reinforcement_learning/networks/world_models/world_model.py +++ b/cares_reinforcement_learning/networks/world_models/world_model.py @@ -158,6 +158,7 @@ def train_reward( reward_loss = F.mse_loss(rwd_mean, rewards) reward_loss.backward() self.rwd_model_optimizers[indice].step() + self.counter += 1 def pred_rewards( self, From 9a4cefa1df52525b734271179e2ef32ba89108bc Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 30 Dec 2024 13:23:16 +1300 Subject: [PATCH 80/91] Fix exploration --- .../algorithm/mbrl/DynaSAC_Bounded.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index d1fcf366..59fb261f 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -131,21 +131,21 @@ def select_action_from_policy( total_unc = (aleatoric**2 + epistemic**2) ** 0.5 uncert = torch.mean(total_unc, dim=1) world_dist = F.softmax(uncert, dim=0) - # world_dist -= torch.min(world_dist) + world_dist -= torch.min(world_dist) - Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) - Q_s = torch.minimum(Q_1, Q_2) - Q_s = Q_s.squeeze() - multi_log_pi = Q_s - # multi_log_pi = multi_log_pi.squeeze() + # Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + # Q_s = torch.minimum(Q_1, Q_2) + # Q_s = Q_s.squeeze() + # multi_log_pi = Q_s + + multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) final_dist = policy_dist + self.threshold * world_dist - # 
final_dist = F.softmax(final_dist, dim=0) - candi = torch.argmax(final_dist) - # new_dist = torch.distributions.Categorical(final_dist) - # candi = new_dist.sample([5]).squeeze() - # print(self._jsd(policy_dist, final_dist)) + final_dist = F.softmax(final_dist, dim=0) + # candi = torch.argmax(final_dist) + new_dist = torch.distributions.Categorical(final_dist) + candi = new_dist.sample([1]).squeeze() action = multi_action[candi] else: (action, _, _) = self.actor_net(state_tensor) From e3cf728148c71600e0075790012485e37f021c73 Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 30 Dec 2024 13:46:03 +1300 Subject: [PATCH 81/91] Fix exploration --- .../algorithm/mbrl/STEVESAC_Bounded.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py index 5195d98e..721a0811 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -123,23 +123,22 @@ def select_action_from_policy( total_unc = (aleatoric**2 + epistemic**2) ** 0.5 uncert = torch.mean(total_unc, dim=1) world_dist = F.softmax(uncert, dim=0) - # world_dist -= torch.min(world_dist) + world_dist -= torch.min(world_dist) - Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) - Q_s = torch.minimum(Q_1, Q_2) - Q_s = Q_s.squeeze() - multi_log_pi = Q_s + # Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + # Q_s = torch.minimum(Q_1, Q_2) + # Q_s = Q_s.squeeze() + # multi_log_pi = Q_s - # multi_log_pi = multi_log_pi.squeeze() + multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - final_dist = ( - 1 - self.threshold - ) * policy_dist + self.threshold * world_dist - candi = torch.argmax(final_dist) - # final_dist = F.softmax(final_dist, dim=0) - # new_dist = torch.distributions.Categorical(final_dist) - # candi = new_dist.sample([5]).squeeze() - # print(self._jsd(policy_dist, final_dist)) + final_dist = policy_dist + self.threshold * world_dist + + # candi = torch.argmax(final_dist) + final_dist = F.softmax(final_dist, dim=0) + new_dist = torch.distributions.Categorical(final_dist) + candi = new_dist.sample([1]).squeeze() + action = multi_action[candi] else: (action, _, _) = self.actor_net(state_tensor) From 971884c0cc887b76042dbcb4e79dbc4a0e73dcc1 Mon Sep 17 00:00:00 2001 From: tony Date: Wed, 1 Jan 2025 20:13:43 +1300 Subject: [PATCH 82/91] Fix bounded exploration --- cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py | 2 +- cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index 59fb261f..ac96fae3 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -141,7 +141,7 @@ def select_action_from_policy( multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - final_dist = policy_dist + self.threshold * world_dist + final_dist = (1- self.threshold) * policy_dist + self.threshold * world_dist final_dist = F.softmax(final_dist, dim=0) # candi = torch.argmax(final_dist) new_dist = torch.distributions.Categorical(final_dist) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py 
b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py index 721a0811..48682cdc 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -132,7 +132,7 @@ def select_action_from_policy( multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - final_dist = policy_dist + self.threshold * world_dist + final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist # candi = torch.argmax(final_dist) final_dist = F.softmax(final_dist, dim=0) From 86d97308252a87dc259227a19c335d3420bc947d Mon Sep 17 00:00:00 2001 From: "Formatter [BOT]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 1 Jan 2025 07:14:04 +0000 Subject: [PATCH 83/91] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../algorithm/mbrl/DynaSAC_Bounded.py | 4 +++- .../algorithm/mbrl/STEVESAC_Bounded.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py index ac96fae3..2915266d 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -141,7 +141,9 @@ def select_action_from_policy( multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - final_dist = (1- self.threshold) * policy_dist + self.threshold * world_dist + final_dist = ( + 1 - self.threshold + ) * policy_dist + self.threshold * world_dist final_dist = F.softmax(final_dist, dim=0) # candi = torch.argmax(final_dist) new_dist = torch.distributions.Categorical(final_dist) diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py index 48682cdc..8e602c20 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -132,7 +132,9 @@ def select_action_from_policy( multi_log_pi = multi_log_pi.squeeze() policy_dist = F.softmax(multi_log_pi, dim=0) - final_dist = (1 - self.threshold) * policy_dist + self.threshold * world_dist + final_dist = ( + 1 - self.threshold + ) * policy_dist + self.threshold * world_dist # candi = torch.argmax(final_dist) final_dist = F.softmax(final_dist, dim=0) From 4d1da67795ab85c4a926ca7322581f614a16819c Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 6 Jan 2025 09:52:52 +1300 Subject: [PATCH 84/91] Fix iw --- .../algorithm/mbrl/DynaSAC_IW_NS.py | 457 ------------------ .../algorithm/mbrl/DynaSAC_NS_IW.py | 359 ++++++++++++++ .../algorithm/mbrl/__init__.py | 11 +- .../util/configurations.py | 12 +- .../util/network_factory.py | 89 ++-- 5 files changed, 422 insertions(+), 506 deletions(-) delete mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py deleted file mode 100644 index 7684f817..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_IW_NS.py +++ /dev/null @@ -1,457 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
- -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -import torch.nn.functional as F - -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.world_models.ensemble import ( - Ensemble_Dyna_Big, -) - -from cares_reinforcement_learning.util.helpers import denormalize_observation_delta - - -class DynaSAC_ScaleBatchReweight: - """ - Max as ? - """ - - def __init__( - self, - actor_network: torch.nn.Module, - critic_network: torch.nn.Module, - world_network: Ensemble_Dyna_Big, - gamma: float, - tau: float, - action_num: int, - actor_lr: float, - critic_lr: float, - alpha_lr: float, - num_samples: int, - horizon: int, - threshold_scale: float, - reweight_critic: bool, - reweight_actor: bool, - mode: int, - sample_times: int, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - self.reweight_critic = reweight_critic - self.reweight_actor = reweight_actor - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = gamma - self.tau = tau - - self.num_samples = num_samples - self.horizon = horizon - self.action_num = action_num - - self.learn_counter = 0 - self.policy_update_freq = 1 - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. - self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) - self.log_alpha.requires_grad = True - self.target_entropy = -action_num - self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) - - # World model - self.world_model = world_network - # Parameter - self.threshold_scale = threshold_scale - self.mode = mode - self.sample_times = sample_times - - @property - def _alpha(self) -> float: - return self.log_alpha.exp() - - # pylint: disable-next=unused-argument to keep the same interface - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, - ) -> None: - ################## Update the Critic First #################### - # Have more target values? 
- with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - - if self.reweight_critic: - # Reweighted loss function. weight not participant in training. - l2_loss_one = (q_values_one - q_target).pow(2) - l2_loss_two = (q_values_two - q_target).pow(2) - - weights = weights.detach() - disc_l2_loss_one = l2_loss_one * weights - disc_l2_loss_two = l2_loss_two * weights - # A ratio to scale the loss back to original loss scale. - - ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) - ratio_1 = ratio_1.detach() - ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) - ratio_2 = ratio_2.detach() - - critic_loss_one = disc_l2_loss_one.mean() * ratio_1 - critic_loss_two = disc_l2_loss_two.mean() * ratio_2 - - critic_loss_total = critic_loss_one + critic_loss_two - else: - critic_loss_one = F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - ################## Update the Actor Second #################### - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - - if self.reweight_actor: - weights = weights.detach() - a_loss = (self._alpha * first_log_p) - min_qf_pi - disc_actor_loss = a_loss * weights - ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) - ratio = ratio.detach() - actor_loss = ratio * torch.mean(disc_actor_loss) - else: - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - if self.learn_counter % self.policy_update_freq == 0: - for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() - ): - target_param.data.copy_( - param.data * self.tau + target_param.data * (1.0 - self.tau) - ) - - def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, _, _ = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - - self.world_model.train_world( - states=states, - actions=actions, - next_states=next_states, - ) - self.world_model.train_reward( - next_states=next_states, - rewards=rewards, - ) - - def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = 
torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - full_weights = torch.ones(rewards.shape).to(self.device) - # Step 2 train as usual - self._train_policy( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - weights=full_weights, - ) - # # # Step 3 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def _dyna_generate_and_train(self, next_states): - """ - Only off-policy Dyna will work. - :param next_states: - """ - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - pred_uncerts = [] - with torch.no_grad(): - pred_state = next_states - for _ in range(self.horizon): - pred_state = torch.repeat_interleave( - pred_state, self.num_samples, dim=0 - ) - # This part is controversial. But random actions is empirically better. - rand_acts = np.random.uniform( - -1, 1, (pred_state.shape[0], self.action_num) - ) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - - pred_next_state, _, pred_mean, pred_var = ( - self.world_model.pred_next_states(pred_state, pred_acts) - ) - uncert = self.sampling( - curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var - ) - uncert = uncert.unsqueeze(dim=1).to(self.device) - pred_uncerts.append(uncert) - - pred_reward = self.world_model.pred_rewards(pred_next_state) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - pred_weights = torch.vstack(pred_uncerts) - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - # states, actions, rewards, next_states, not_dones - self._train_policy( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights - ) - - def sampling(self, curr_states, pred_means, pred_vars): - """ - High std means low uncertainty. Therefore, divided by 1 - - :param pred_means: - :param pred_vars: - :return: - """ - with torch.no_grad(): - # 5 models. Each predict 10 next_states. - sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( - [self.sample_times] - ) - sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( - [self.sample_times] - ) - sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( - [self.sample_times] - ) - sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( - [self.sample_times] - ) - sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( - [self.sample_times] - ) - - rs = [] - acts = [] - qs = [] - # Varying the next_state's distribution. 
- for i in range(self.sample_times): - sample1i = denormalize_observation_delta( - sample1[i], self.world_model.statistics - ) - sample1i += curr_states - sample2i = denormalize_observation_delta( - sample2[i], self.world_model.statistics - ) - sample2i += curr_states - sample3i = denormalize_observation_delta( - sample3[i], self.world_model.statistics - ) - sample3i += curr_states - sample4i = denormalize_observation_delta( - sample4[i], self.world_model.statistics - ) - sample4i += curr_states - sample5i = denormalize_observation_delta( - sample5[i], self.world_model.statistics - ) - sample5i += curr_states - - if self.reweight_critic == 1: - # 5 models, each sampled 10 times = 50, - pred_rwd1 = self.world_model.pred_rewards(sample1i) - pred_rwd2 = self.world_model.pred_rewards(sample2i) - pred_rwd3 = self.world_model.pred_rewards(sample3i) - pred_rwd4 = self.world_model.pred_rewards(sample4i) - pred_rwd5 = self.world_model.pred_rewards(sample5i) - rs.append(pred_rwd1) - rs.append(pred_rwd2) - rs.append(pred_rwd3) - rs.append(pred_rwd4) - rs.append(pred_rwd5) - # Each times, 5 models predict different actions. - # [2560, 17] - pred_act1, log_pi1, _ = self.actor_net(sample1i) - pred_act2, log_pi2, _ = self.actor_net(sample2i) - pred_act3, log_pi3, _ = self.actor_net(sample3i) - pred_act4, log_pi4, _ = self.actor_net(sample4i) - pred_act5, log_pi5, _ = self.actor_net(sample5i) - acts.append(log_pi1) - acts.append(log_pi2) - acts.append(log_pi3) - acts.append(log_pi4) - acts.append(log_pi5) - # How to become the same next state, different action. - # Now: sample1 sample2... same next state, different model. - # Pred_act1 pred_act2 same next_state, different actions. - # 5[] * 10[var of state] - qa1, qa2 = self.target_critic_net(sample1i, pred_act1) - qa = torch.minimum(qa1, qa2) - qb1, qb2 = self.target_critic_net(sample2i, pred_act2) - qb = torch.minimum(qb1, qb2) - qc1, qc2 = self.target_critic_net(sample3i, pred_act3) - qc = torch.minimum(qc1, qc2) - qd1, qd2 = self.target_critic_net(sample4i, pred_act4) - qd = torch.minimum(qd1, qd2) - qe1, qe2 = self.target_critic_net(sample5i, pred_act5) - qe = torch.minimum(qe1, qe2) - qs.append(qa) - qs.append(qb) - qs.append(qc) - qs.append(qd) - qs.append(qe) - if self.reweight_critic == 1: - rs = torch.stack(rs) - acts = torch.stack(acts) - qs = torch.stack(qs) - - if self.reweight_critic: - var_r = torch.var(rs, dim=0) - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - mean_r = torch.mean(rs, dim=0, keepdim=True) - diff_r = rs - mean_r - cov_rq = torch.mean(diff_r * diff_q, dim=0) - cov_ra = torch.mean(diff_r * diff_a, dim=0) - - gamma_sq = self.gamma * self.gamma - total_var = ( - var_r - + gamma_sq * var_a - + gamma_sq * var_q - + gamma_sq * 2 * cov_aq - + gamma_sq * 2 * cov_rq - + gamma_sq * 2 * cov_ra - ) - - if self.reweight_actor: - mean_a = torch.mean(acts, dim=0, keepdim=True) - mean_q = torch.mean(qs, dim=0, keepdim=True) - diff_a = acts - mean_a - diff_q = qs - mean_q - cov_aq = torch.mean(diff_a * diff_q, dim=0) - - var_a = torch.var(acts, dim=0) - var_q = torch.var(qs, dim=0) - # For actor: alpha^2 * var_a + var_q - total_var = (self._alpha**2) * var_a + var_q + (self._alpha**2) * cov_aq - - min_var = torch.min(total_var) - max_var = torch.max(total_var) - # As (max-min) decrease, threshold should go down. 
- threshold = self.threshold_scale * (max_var - min_var) + min_var - total_var[total_var <= threshold] = threshold - - total_var += 0.00000001 - total_stds = 1 / total_var - - return total_stds.detach() - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filename: str, filepath: str = "models") -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - dir_exists = os.path.exists(path) - if not dir_exists: - os.makedirs(path) - torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - path = f"{filepath}/models" if filepath != "models" else filepath - self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py new file mode 100644 index 00000000..e29f58ee --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py @@ -0,0 +1,359 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) + + +class DynaSAC_NS_IW: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold:float, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + ): + logging.info("-------------------------------------------") + logging.info("----I am runing the Dyna_SAC_NS Agent! ----") + logging.info("-------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.threshold = threshold + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
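+        # Automatic entropy tuning: the temperature alpha = exp(log_alpha)
+        # starts at 1.0 and the entropy target is -action_num (the usual SAC
+        # heuristic of minus the action dimension). The alpha_loss in
+        # _train_policy then raises alpha while the policy entropy is below
+        # this target and lowers it once the entropy exceeds it.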
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape).to(self.device) + weights = weights.to(self.device) + info = {} + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + assert (len(q_target.shape) == 2) and (q_target.shape[1] == 1) + q_target = q_target.detach() + q_values_one, q_values_two = self.critic_net(states, actions) + # critic_loss_one = F.mse_loss(q_values_one, q_target) + td_error1 = (q_target - q_values_one) # * weights + td_error2 = (q_target - q_values_two) # * weights + critic_loss_one = 0.5 * (td_error1.pow(2) * weights).mean() + critic_loss_two = 0.5 * (td_error2.pow(2) * weights).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = 
len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + self._dyna_generate_and_train(next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + weights = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + # This part is controversial. But random actions is empirically better. + # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + (pred_acts, _, _) = self.actor_net(pred_state) + # [2560, 18] + pred_next_state, _, norm_means_, norm_vars_ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards(observation=pred_state, + action=pred_acts, + next_observation=pred_next_state) + uncert = self.sampling(pred_state, norm_means_, norm_vars_) + # Q, A, R + weights.append(uncert) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(weights) + # Pay attention to here! It is dones in the Cares RL Code! 
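+        # In this buffer convention dones == 1 marks a terminal transition
+        # (see the (1 - dones) mask in _train_policy), so the imagined rollout
+        # steps below are passed with all-zero dones and the critic keeps
+        # bootstrapping through model-generated transitions.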
+ pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def sampling(self, curr_states, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: [num_model, batch_size * 10, observation_dim] + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + r_s = [] + act_logs = [] + q_s = [] + # For each model + for i in range(pred_means.shape[0]): + sample_times = 10 + samples = torch.distributions.Normal(pred_means[i], pred_vars[i]).sample([sample_times]) + # For each sampling + for i in range(sample_times): + samples[i] = denormalize_observation_delta(samples[i], self.world_model.statistics) + samples[i] += curr_states + pred_act, log_pi, _ = self.actor_net(samples[i]) + act_logs.append(log_pi) + # pred_rwd1 = self.world_model.pred_rewards(samples[i]) + rewards = self.reward_function(curr_states, samples[i]) + r_s.append(rewards) + qa1, qa2 = self.target_critic_net(samples[i], pred_act) + q_a = torch.minimum(qa1, qa2) + q_s.append(q_a) + r_s = torch.stack(r_s) + act_logs = torch.stack(act_logs) + q_s = torch.stack(q_s) + + var_r = torch.var(r_s, dim=0) + var_a = torch.var(act_logs, dim=0) + var_q = torch.var(q_s, dim=0) + + mean_a = torch.mean(act_logs, dim=0, keepdim=True) + mean_q = torch.mean(q_s, dim=0, keepdim=True) + diff_a = act_logs - mean_a + diff_q = q_s - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + mean_r = torch.mean(r_s, dim=0, keepdim=True) + diff_r = r_s - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ + gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + # # For actor: alpha^2 * var_a + var_q + min_var = torch.min(total_var) + max_var = torch.max(total_var) + # As (max-min) decrease, threshold should go down. + threshold = self.threshold * (max_var - min_var) + min_var + total_var[total_var <= threshold] = threshold + # Inverse variance. 
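+            # Sketch of the weighting that follows: each imagined transition
+            # gets weight 1 / total_var, where total_var aggregates the
+            # ensemble's disagreement in reward, log-probability and Q-value
+            # predictions (the var/cov terms above). Flooring total_var at
+            # threshold * (max - min) + min caps the largest weight, and the
+            # weights are then min-max normalised with a small offset so none
+            # become exactly zero.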
+ weights = 1 / total_var + # Normalization + new_min_var = torch.min(weights) + new_max_var = torch.max(weights) + weights = (weights - new_min_var) / (new_max_var - new_min_var) + weights += 0.0001 + return weights.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 0aa63719..b7904238 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,8 +1,13 @@ +# Baseline from .DynaSAC_NS import DynaSAC_NS +from .STEVESAC import STEVESAC +# Bounded Exploration from .DynaSAC_Bounded import DynaSAC_Bounded -from .DynaSAC_IW_NS import DynaSAC_ScaleBatchReweight +from .STEVESAC_Bounded import STEVESAC_Bounded +# Immersive Weighting +from .DynaSAC_NS_IW import DynaSAC_NS_IW from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight from .DynaSAC_UWAC_NS import DynaSAC_UWACReweight from .DynaSAC_BIV_NS import DynaSAC_BIVReweight -from .STEVESAC_Bounded import STEVESAC_Bounded -from .STEVESAC import STEVESAC + + diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 26b36f6f..a33ae098 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -267,6 +267,7 @@ class DynaSAC_BoundedConfig(AlgorithmConfig): actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + num_rwd_models: int = 1 max_steps_exploration: int = 256 num_models: int = 5 world_model_lr: float = 0.001 @@ -305,9 +306,11 @@ class STEVE_MEANConfig(AlgorithmConfig): gripper: bool = False -class DynaSAC_SAS_Immersive_WeightConfig(AlgorithmConfig): - algorithm: str = Field("DynaSAC_IWNS", Literal=True) +class DynaSAC_NS_IW(AlgorithmConfig): + algorithm: str = Field("DynaSAC_NS_IW", Literal=True) type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) actor_lr: float = 3e-4 critic_lr: float = 3e-4 alpha_lr: float = 3e-4 @@ -320,16 +323,19 @@ class DynaSAC_SAS_Immersive_WeightConfig(AlgorithmConfig): actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + max_steps_exploration: int = 256 num_models: int = 5 world_model_lr: float = 0.001 horizon: int = 3 num_samples: int = 10 + num_rwd_models: int = 1 + sas: bool = False threshold: float = 0.1 reweight_actor: bool = False train_reward: bool = True - train_both: bool = True + train_both: bool = False gripper: bool = False diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 804d4f66..af3592c2 100644 --- 
a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -225,49 +225,52 @@ def create_STEVESAC_Bounded( return agent -# def create_DynaSAC_SAS_Immersive_Weight(observation_size, action_num, config: AlgorithmConfig): -# """ -# Create networks for model-based SAC agent. The Actor and Critic is same. -# An extra world model is added. -# -# """ -# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS_Immersive_Weight -# from cares_reinforcement_learning.networks.SAC import Actor, Critic -# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward -# -# actor = Actor(observation_size, action_num) -# critic = Critic(observation_size, action_num) -# -# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# -# world_model = EnsembleWorldAndOneSASReward( -# observation_size=observation_size, -# num_actions=action_num, -# num_models=config.num_models, -# device=device, -# lr=config.world_model_lr, -# ) -# -# agent = DynaSAC_SAS_Immersive_Weight( -# actor_network=actor, -# critic_network=critic, -# world_network=world_model, -# actor_lr=config.actor_lr, -# critic_lr=config.critic_lr, -# gamma=config.gamma, -# tau=config.tau, -# action_num=action_num, -# device=device, -# alpha_lr=config.alpha_lr, -# horizon=config.horizon, -# num_samples=config.num_samples, -# threshold_scale=config.threshold_scale, -# reweight_critic=config.reweight_critic, -# reweight_actor=config.reweight_actor, -# mode=config.mode, -# sample_times=config.sample_times, -# ) -# return agent +def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_Immersive_WeightConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS_IW + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) + + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) + + device = hlp.get_device() + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, + device=device, + l_r=config.world_model_lr, + sas=config.sas, + ) + + agent = DynaSAC_NS_IW( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + ) + return agent # def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): From 82e017c081fe677d10f6b1aa5acc37261ba5cb27 Mon Sep 17 00:00:00 2001 From: "Formatter [BOT]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 5 Jan 2025 20:53:14 +0000 Subject: [PATCH 85/91] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../algorithm/mbrl/DynaSAC_NS_IW.py | 60 ++++++++++++------- .../algorithm/mbrl/__init__.py | 4 +- .../util/network_factory.py | 4 +- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py index e29f58ee..1092aeb3 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py @@ -33,7 +33,7 @@ def __init__( alpha_lr: float, num_samples: int, horizon: int, - threshold:float, + threshold: float, device: torch.device, train_reward: bool, train_both: bool, @@ -101,13 +101,13 @@ def select_action_from_policy( return action def _train_policy( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - weights: torch.Tensor, + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, ) -> None: if weights is None: weights = torch.ones(rewards.shape).to(self.device) @@ -119,15 +119,15 @@ def _train_policy( next_states, next_actions ) target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi ) q_target = rewards + self.gamma * (1 - dones) * target_q_values assert (len(q_target.shape) == 2) and (q_target.shape[1] == 1) q_target = q_target.detach() q_values_one, q_values_two = self.critic_net(states, actions) # critic_loss_one = F.mse_loss(q_values_one, q_target) - td_error1 = (q_target - q_values_one) # * weights - td_error2 = (q_target - q_values_two) # * weights + td_error1 = q_target - q_values_one # * weights + td_error2 = q_target - q_values_two # * weights critic_loss_one = 0.5 * (td_error1.pow(2) * weights).mean() critic_loss_two = 0.5 * 
(td_error2.pow(2) * weights).mean() critic_loss_total = critic_loss_one + critic_loss_two @@ -148,7 +148,7 @@ def _train_policy( # update the temperature alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() + self.log_alpha * (first_log_p + self.target_entropy).detach() ).mean() self.log_alpha_optimizer.zero_grad() alpha_loss.backward() @@ -156,7 +156,7 @@ def _train_policy( if self.learn_counter % self.policy_update_freq == 0: for target_param, param in zip( - self.target_critic_net.parameters(), self.critic_net.parameters() + self.target_critic_net.parameters(), self.critic_net.parameters() ): target_param.data.copy_( param.data * self.tau + target_param.data * (1.0 - self.tau) @@ -225,22 +225,26 @@ def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: with torch.no_grad(): pred_state = next_states for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) # This part is controversial. But random actions is empirically better. # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) # pred_acts = torch.FloatTensor(rand_acts).to(self.device) (pred_acts, _, _) = self.actor_net(pred_state) # [2560, 18] - pred_next_state, _, norm_means_, norm_vars_ = self.world_model.pred_next_states( - pred_state, pred_acts + pred_next_state, _, norm_means_, norm_vars_ = ( + self.world_model.pred_next_states(pred_state, pred_acts) ) if self.gripper: pred_reward = self.reward_function(pred_state, pred_next_state) pred_next_state[:, -2:] = pred_state[:, -2:] else: - pred_reward, _ = self.world_model.pred_rewards(observation=pred_state, - action=pred_acts, - next_observation=pred_next_state) + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) uncert = self.sampling(pred_state, norm_means_, norm_vars_) # Q, A, R weights.append(uncert) @@ -291,10 +295,14 @@ def sampling(self, curr_states, pred_means, pred_vars): # For each model for i in range(pred_means.shape[0]): sample_times = 10 - samples = torch.distributions.Normal(pred_means[i], pred_vars[i]).sample([sample_times]) + samples = torch.distributions.Normal( + pred_means[i], pred_vars[i] + ).sample([sample_times]) # For each sampling for i in range(sample_times): - samples[i] = denormalize_observation_delta(samples[i], self.world_model.statistics) + samples[i] = denormalize_observation_delta( + samples[i], self.world_model.statistics + ) samples[i] += curr_states pred_act, log_pi, _ = self.actor_net(samples[i]) act_logs.append(log_pi) @@ -324,8 +332,14 @@ def sampling(self, curr_states, pred_means, pred_vars): cov_ra = torch.mean(diff_r * diff_a, dim=0) gamma_sq = self.gamma * self.gamma - total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + \ - gamma_sq * 2 * cov_rq + gamma_sq * 2 * cov_ra + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) # # For actor: alpha^2 * var_a + var_q min_var = torch.min(total_var) max_var = torch.max(total_var) diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index b7904238..63f0e7cb 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1,13 +1,13 @@ # Baseline from .DynaSAC_NS 
import DynaSAC_NS from .STEVESAC import STEVESAC + # Bounded Exploration from .DynaSAC_Bounded import DynaSAC_Bounded from .STEVESAC_Bounded import STEVESAC_Bounded + # Immersive Weighting from .DynaSAC_NS_IW import DynaSAC_NS_IW from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight from .DynaSAC_UWAC_NS import DynaSAC_UWACReweight from .DynaSAC_BIV_NS import DynaSAC_BIVReweight - - diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index af3592c2..f99d2129 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -225,7 +225,9 @@ def create_STEVESAC_Bounded( return agent -def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_Immersive_WeightConfig): +def create_DynaSAC_NS_IW( + observation_size, action_num, config: acf.DynaSAC_NS_Immersive_WeightConfig +): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From 2f5fc92db8e701d04333cf660a317f37ed4fcd3d Mon Sep 17 00:00:00 2001 From: tony Date: Mon, 6 Jan 2025 10:14:53 +1300 Subject: [PATCH 86/91] Fix iw --- cares_reinforcement_learning/util/configurations.py | 2 +- cares_reinforcement_learning/util/network_factory.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index a33ae098..0791a081 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -306,7 +306,7 @@ class STEVE_MEANConfig(AlgorithmConfig): gripper: bool = False -class DynaSAC_NS_IW(AlgorithmConfig): +class DynaSAC_NS_IWConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_NS_IW", Literal=True) type: str = Field("mbrl", Literal=True) G: int = (1,) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index af3592c2..cf1dbd6d 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -225,7 +225,7 @@ def create_STEVESAC_Bounded( return agent -def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_Immersive_WeightConfig): +def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_IWConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
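The immersive-weighting hunks above (the DynaSAC_NS_IW sampling() method and the weight normalisation at the top of this series) propagate world-model uncertainty into per-sample critic weights: the variance of the imagined SAC target is assembled from the reward, entropy and Q-value variances plus their pairwise covariances, inverted, and min-max normalised. The following standalone sketch restates that computation; the function names and tensor shapes are illustrative only and are not part of the repository API.

import torch

def propagated_target_variance(var_r, var_a, var_q, cov_rq, cov_ra, cov_aq, gamma=0.99):
    # Variance of the imagined target, combining reward (r), entropy (a) and
    # Q-value (q) spread with their pairwise covariances, gamma^2-scaled as in sampling().
    g2 = gamma * gamma
    return var_r + g2 * var_a + g2 * var_q + 2 * g2 * (cov_aq + cov_rq + cov_ra)

def inverse_variance_weights(total_var, eps=1e-4):
    # Trustworthy (low-variance) imagined transitions end up near weight 1,
    # noisy ones near eps; detach so the weights never carry gradients.
    weights = 1.0 / total_var
    weights = (weights - weights.min()) / (weights.max() - weights.min())
    return (weights + eps).detach()

These weights multiply the squared TD errors in _train_policy, so imagined transitions the ensemble disagrees on contribute proportionally less to the critic update.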
From 9e223d2c3d2b987eed971b26aedea213940bf1cb Mon Sep 17 00:00:00 2001 From: "Formatter [BOT]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 5 Jan 2025 21:16:48 +0000 Subject: [PATCH 87/91] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cares_reinforcement_learning/util/network_factory.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index a7bf5e01..cf1dbd6d 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -225,9 +225,7 @@ def create_STEVESAC_Bounded( return agent -def create_DynaSAC_NS_IW( - observation_size, action_num, config: acf.DynaSAC_NS_IWConfig -): +def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_IWConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From eae247cc4f733a6cf77ebbf969e2601cc44d6b3a Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 9 Jan 2025 14:48:49 +1300 Subject: [PATCH 88/91] Fix iw --- .../algorithm/mbrl/DynaSAC_Bounded_Yao.py | 351 ++++++++++++++++++ .../algorithm/mbrl/STEVESAC_Bounded_Yao.py | 344 +++++++++++++++++ .../algorithm/mbrl/__init__.py | 3 +- .../util/configurations.py | 66 ++++ .../util/network_factory.py | 103 +++++ 5 files changed, 866 insertions(+), 1 deletion(-) create mode 100644 cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py create mode 100644 cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py new file mode 100644 index 00000000..3caa4d41 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py @@ -0,0 +1,351 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +import torch.nn.functional as F + + +class DynaSAC_Bounded_Yao: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int, + ): + logging.info("-----------------------------------------------") + logging.info("----I am runing the DynaSAC_Bounded Agent! 
----") + logging.info("-----------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def _jsd(self, p, q): + p, q = p.view(-1, p.size(-1)).log_softmax(-1), q.view( + -1, q.size(-1) + ).log_softmax(-1) + m = 0.5 * (p + q) + return 0.5 * (self.k_l(m, p) + self.k_l(m, q)) + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) + # [10, 17] + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + world_dist = torch.mean(total_unc, dim=1) + # world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) + + Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + Q_s = torch.minimum(Q_1, Q_2) + Q_s = Q_s.squeeze() + policy_dist = Q_s + + # policy_dist = multi_log_pi.squeeze() + # policy_dist = F.softmax(multi_log_pi, dim=0) + final_dist = policy_dist + self.threshold * world_dist + + # final_dist = F.softmax(final_dist, dim=0) + # candi = torch.argmax(final_dist) + # new_dist = torch.distributions.Categorical(final_dist) + # candi = new_dist.sample([1]).squeeze() + candi = torch.argmax(final_dist) + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return 
action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + 
weights=torch.ones(rewards.shape), + ) + self._dyna_generate_and_train(next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. + # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_acts, _, _ = self.actor_net(pred_state) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, + pred_actions, + pred_rs, + pred_n_states, + pred_dones, + torch.ones(pred_rs.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py new file mode 100644 index 00000000..e9f88d7f --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py @@ -0,0 +1,344 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +import torch.nn.functional as F + + +class STEVESAC_Bounded_Yao: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int, + ): + logging.info("------------------------------------------------") + logging.info("----I am runing the STEVESAC_Bounded Agent! ----") + logging.info("------------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) + # [10, 17] + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + world_dist = torch.mean(total_unc, dim=1) + # world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) + + Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + Q_s = torch.minimum(Q_1, Q_2) + Q_s = Q_s.squeeze() + policy_dist = Q_s + + # multi_log_pi = multi_log_pi.squeeze() + # policy_dist = F.softmax(multi_log_pi, dim=0) + + final_dist = policy_dist + self.threshold * world_dist + + # candi = torch.argmax(final_dist) + # final_dist = F.softmax(final_dist, dim=0) + # new_dist = torch.distributions.Categorical(final_dist) + candi = torch.argmax(final_dist) + + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + not_dones = 1 - dones + q_means = [] + q_weights = [] + accum_dist_rewards = torch.repeat_interleave( + rewards.unsqueeze(dim=0), repeats=30, dim=0 + ) + # 5 * 5 * 4 = 100 + for hori in range(self.horizon): + _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) + mean_predictions, all_mean_next, _, _ = ( + self.world_model.pred_next_states(next_states, curr_hori_action) + ) + pred_rewards, _ = self.world_model.pred_all_rewards( + observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next, + ) + pred_rewards *= self.gamma ** (hori + 1) + accum_dist_rewards += pred_rewards + # V = Q - alpha * logi + pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) + pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) + pred_v1 = pred_q1 - self._alpha * curr_hori_log_pi + pred_v2 = 
pred_q2 - self._alpha * curr_hori_log_pi + pred_v3 = pred_q3 - self._alpha * curr_hori_log_pi + pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi + q_0 = [] + for i in range(pred_rewards.shape[0]): + pred_tq1 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + ) + pred_tq2 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + ) + pred_tq3 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + ) + pred_tq4 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + ) + q_0.append(pred_tq1) + q_0.append(pred_tq2) + q_0.append(pred_tq3) + q_0.append(pred_tq4) + q_0 = torch.stack(q_0) + # Compute var, mean and add them to the queue + # [100, 256, 1] -> [256, 1] + mean_0 = torch.mean(q_0, dim=0) + q_means.append(mean_0) + var_0 = torch.var(q_0, dim=0) + var_0[torch.abs(var_0) < 0.0001] = 0.0001 + weights_0 = 1.0 / var_0 + q_weights.append(weights_0) + next_states = mean_predictions + all_means = torch.stack(q_means) + all_weights = torch.stack(q_weights) + total_weights = torch.sum(all_weights, dim=0) + for n in range(self.horizon): + all_weights[n] /= total_weights + q_target = torch.sum(all_weights * all_means, dim=0) + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, 
next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 63f0e7cb..c12650b5 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -5,7 +5,8 @@ # Bounded Exploration from .DynaSAC_Bounded import DynaSAC_Bounded from .STEVESAC_Bounded import STEVESAC_Bounded - +from .DynaSAC_Bounded_Yao import DynaSAC_Bounded_Yao +from .STEVESAC_Bounded_Yao import STEVESAC_Bounded_Yao # Immersive Weighting from .DynaSAC_NS_IW import DynaSAC_NS_IW from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 0791a081..a0a112b3 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -281,6 +281,72 @@ class DynaSAC_BoundedConfig(AlgorithmConfig): exploration_sample: int = 5 +class STEVESAC_BoundedConfig_Yao(AlgorithmConfig): + algorithm: str = Field("STEVESAC_Bounded_Yao", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + + num_models: int = 6 + num_rwd_models: int = 5 + world_model_lr: 
float = 0.001 + + horizon: int = 3 + + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + + threshold: float = 0.1 + exploration_sample: int = 5 + +class DynaSAC_Bounded_YaoConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Bounded_Yao", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + num_rwd_models: int = 1 + max_steps_exploration: int = 256 + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + threshold: float = 0.1 + exploration_sample: int = 5 + + class STEVE_MEANConfig(AlgorithmConfig): algorithm: str = Field("STEVE", Literal=True) type: str = Field("mbrl", Literal=True) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index cf1dbd6d..61738453 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -129,6 +129,58 @@ def create_DynaSAC_Bounded( return agent +def create_DynaSAC_Bounded_Yao( + observation_size, action_num, config: acf.DynaSAC_Bounded_YaoConfig +): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Bounded_Yao + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) + + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) + + device = hlp.get_device() + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + device=device, + l_r=config.world_model_lr, + sas=config.sas, + prob_rwd=True, + boost_inter=30, + ) + + agent = DynaSAC_Bounded_Yao( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample, + ) + + return agent + + def create_STEVESAC(observation_size, action_num, config: acf.STEVESACConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. @@ -225,6 +277,57 @@ def create_STEVESAC_Bounded( return agent +def create_STEVESAC_Bounded_Yao( + observation_size, action_num, config: acf.STEVESAC_Bounded_YaoConfig +): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ """ + + from cares_reinforcement_learning.algorithm.mbrl import STEVESAC_Bounded_Yao + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) + + actor = Actor(observation_size, action_num, config=config) + critic = Critic(observation_size, action_num, config=config) + + device = hlp.get_device() + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, + device=device, + l_r=config.world_model_lr, + sas=config.sas, + ) + + agent = STEVESAC_Bounded_Yao( + actor_network=actor, + critic_network=critic, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample, + ) + + return agent + + def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_IWConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. From 12e03c04fb89edced7900e5de62f33f11d0e3860 Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 9 Jan 2025 14:56:30 +1300 Subject: [PATCH 89/91] add yaos' for bounded exploration --- cares_reinforcement_learning/util/network_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 61738453..b38474e0 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -328,7 +328,9 @@ def create_STEVESAC_Bounded_Yao( return agent -def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_IWConfig): +def create_DynaSAC_NS_IW( + observation_size, action_num, config: acf.DynaSAC_NS_IWConfig +): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
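The patches above register create_DynaSAC_Bounded_Yao and create_STEVESAC_Bounded_Yao next to their config classes. A hedged end-to-end usage sketch follows; the module aliases, the direct function call (rather than any NetworkFactory wrapper the repository may provide) and the observation/action sizes are assumptions for illustration, not confirmed entry points.

import numpy as np

from cares_reinforcement_learning.util import configurations as acf
from cares_reinforcement_learning.util import network_factory

observation_size, action_num = 17, 6  # assumed environment dimensions

config = acf.DynaSAC_Bounded_YaoConfig(
    threshold=0.1,          # weight of the world-model uncertainty bonus
    exploration_sample=5,   # candidate actions scored per environment step
)
agent = network_factory.create_DynaSAC_Bounded_Yao(observation_size, action_num, config)

state = np.zeros(observation_size, dtype=np.float32)
action = agent.select_action_from_policy(state)  # plain SAC sampling until set_statistics() enables the bounded scoring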
From e2537410268817cef386158cef16893338738595 Mon Sep 17 00:00:00 2001 From: "Formatter [BOT]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 02:01:40 +0000 Subject: [PATCH 90/91] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cares_reinforcement_learning/algorithm/mbrl/__init__.py | 1 + cares_reinforcement_learning/util/configurations.py | 1 + cares_reinforcement_learning/util/network_factory.py | 4 +--- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index c12650b5..2c0b2dd2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -7,6 +7,7 @@ from .STEVESAC_Bounded import STEVESAC_Bounded from .DynaSAC_Bounded_Yao import DynaSAC_Bounded_Yao from .STEVESAC_Bounded_Yao import STEVESAC_Bounded_Yao + # Immersive Weighting from .DynaSAC_NS_IW import DynaSAC_NS_IW from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index a0a112b3..1fc68e80 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -315,6 +315,7 @@ class STEVESAC_BoundedConfig_Yao(AlgorithmConfig): threshold: float = 0.1 exploration_sample: int = 5 + class DynaSAC_Bounded_YaoConfig(AlgorithmConfig): algorithm: str = Field("DynaSAC_Bounded_Yao", Literal=True) type: str = Field("mbrl", Literal=True) diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index b38474e0..61738453 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -328,9 +328,7 @@ def create_STEVESAC_Bounded_Yao( return agent -def create_DynaSAC_NS_IW( - observation_size, action_num, config: acf.DynaSAC_NS_IWConfig -): +def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_IWConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. From ef4c2e933ede083eba7fdb89ceb6d71c7393e734 Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 9 Jan 2025 15:19:36 +1300 Subject: [PATCH 91/91] add yaos' for bounded exploration --- cares_reinforcement_learning/util/configurations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 1fc68e80..a1a47c4b 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -281,7 +281,7 @@ class DynaSAC_BoundedConfig(AlgorithmConfig): exploration_sample: int = 5 -class STEVESAC_BoundedConfig_Yao(AlgorithmConfig): +class STEVESAC_Bounded_YaoConfig(AlgorithmConfig): algorithm: str = Field("STEVESAC_Bounded_Yao", Literal=True) type: str = Field("mbrl", Literal=True) G: int = (1,)
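
Across both new agents the exploration rule is the same and is worth reading in isolation: replicate the state, let the actor propose exploration_sample candidate actions, score each by its minimum critic value plus a scaled, clipped ensemble uncertainty, and act on the argmax. The sketch below condenses the select_action_from_policy bodies above; actor, critic and world_model are stand-ins for the repository's networks, not a drop-in implementation.

import torch

def bounded_exploration_action(actor, critic, world_model, state_tensor,
                               exploration_sample=5, threshold=0.1):
    # state_tensor: [1, obs_dim]. Returns the candidate action maximising
    # min(Q1, Q2) + threshold * world-model uncertainty, as in the *_Yao agents.
    with torch.no_grad():
        states = torch.repeat_interleave(state_tensor, exploration_sample, dim=0)
        actions, _, _ = actor(states)
        _, _, next_means, next_vars = world_model.pred_next_states(states, actions)
        aleatoric = torch.clamp(torch.mean(next_vars**2, dim=0) ** 0.5, max=10e3)
        epistemic = torch.clamp(torch.var(next_means, dim=0) ** 0.5, max=10e3)
        uncertainty = ((aleatoric**2 + epistemic**2) ** 0.5).mean(dim=1)
        q1, q2 = critic(states, actions)
        score = torch.minimum(q1, q2).squeeze() + threshold * uncertainty
        return actions[torch.argmax(score)]

Setting threshold to 0 in the config bypasses this scoring and reverts to ordinary SAC action sampling; the default of 0.1 keeps the uncertainty bonus active.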