diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py deleted file mode 100644 index a527b428..00000000 --- a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC.py +++ /dev/null @@ -1,253 +0,0 @@ -""" -Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." - -Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 - -This code runs automatic entropy tuning -""" - -import copy -import logging -import os - -import numpy as np -import torch -import torch.nn.functional as F - -import cares_reinforcement_learning.util.helpers as hlp -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.DynaSAC import Actor, Critic -from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( - EnsembleWorldReward, -) -from cares_reinforcement_learning.util.configurations import DynaSACConfig - - -class DynaSAC: - def __init__( - self, - actor_network: Actor, - critic_network: Critic, - world_network: EnsembleWorldReward, - config: DynaSACConfig, - device: torch.device, - ): - self.type = "mbrl" - self.device = device - - # this may be called policy_net in other implementations - self.actor_net = actor_network.to(self.device) - # this may be called soft_q_net in other implementations - self.critic_net = critic_network.to(self.device) - self.target_critic_net = copy.deepcopy(self.critic_net) - - self.gamma = config.gamma - self.tau = config.tau - - self.num_samples = config.num_samples - self.horizon = config.horizon - self.action_num = self.actor_net.num_actions - - self.learn_counter = 0 - self.policy_update_freq = config.policy_update_freq - self.target_update_freq = config.target_update_freq - - self.target_entropy = -self.action_num - - self.actor_net_optimiser = torch.optim.Adam( - self.actor_net.parameters(), lr=config.actor_lr - ) - self.critic_net_optimiser = torch.optim.Adam( - self.critic_net.parameters(), lr=config.critic_lr - ) - - # Set to initial alpha to 1.0 according to other baselines. 
- self.log_alpha = torch.tensor(np.log(1.0)).to(device) - self.log_alpha.requires_grad = True - self.log_alpha_optimizer = torch.optim.Adam( - [self.log_alpha], lr=config.alpha_lr - ) - - # World model - self.world_model = world_network - - @property - def _alpha(self) -> torch.Tensor: - return self.log_alpha.exp() - - def select_action_from_policy( - self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 - ) -> np.ndarray: - # pylint: disable-next=unused-argument - - # note that when evaluating this algorithm we need to select mu as - self.actor_net.eval() - with torch.no_grad(): - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - if evaluation is False: - (action, _, _) = self.actor_net(state_tensor) - else: - (_, _, action) = self.actor_net(state_tensor) - action = action.cpu().data.numpy().flatten() - self.actor_net.train() - return action - - def _update_critic_actor(self, states, actions, rewards, next_states, dones): - # Update Critic - self._update_critic(states, actions, rewards, next_states, dones) - - if self.learn_counter % self.policy_update_freq == 0: - # Update Actor - self._update_actor(states) - - if self.learn_counter % self.target_update_freq == 0: - hlp.soft_update_params(self.critic_net, self.target_critic_net, self.tau) - - def _update_critic(self, states, actions, rewards, next_states, dones): - with torch.no_grad(): - next_actions, next_log_pi, _ = self.actor_net(next_states) - target_q_one, target_q_two = self.target_critic_net( - next_states, next_actions - ) - target_q_values = ( - torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi - ) - q_target = rewards + self.gamma * (1 - dones) * target_q_values - - q_values_one, q_values_two = self.critic_net(states, actions) - critic_loss_one = F.mse_loss(q_values_one, q_target) - critic_loss_two = F.mse_loss(q_values_two, q_target) - critic_loss_total = critic_loss_one + critic_loss_two - - # Update the Critic - self.critic_net_optimiser.zero_grad() - critic_loss_total.backward() - self.critic_net_optimiser.step() - - def _update_actor(self, states): - pi, first_log_p, _ = self.actor_net(states) - qf1_pi, qf2_pi = self.critic_net(states, pi) - min_qf_pi = torch.minimum(qf1_pi, qf2_pi) - actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() - - # Update the Actor - self.actor_net_optimiser.zero_grad() - actor_loss.backward() - self.actor_net_optimiser.step() - - # Update the temperature - alpha_loss = -( - self.log_alpha * (first_log_p + self.target_entropy).detach() - ).mean() - - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() - - def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: - pred_states = [] - pred_actions = [] - pred_rs = [] - pred_n_states = [] - - pred_state = next_states - - for _ in range(self.horizon): - pred_state = torch.repeat_interleave(pred_state, self.num_samples, dim=0) - # This part is controversial. But random actions is empirically better. 
- rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) - pred_acts = torch.FloatTensor(rand_acts).to(self.device) - pred_next_state, _, _, _ = self.world_model.pred_next_states( - pred_state, pred_acts - ) - - pred_reward, _ = self.world_model.pred_rewards(pred_state, pred_acts) - pred_states.append(pred_state) - pred_actions.append(pred_acts.detach()) - pred_rs.append(pred_reward.detach()) - pred_n_states.append(pred_next_state.detach()) - pred_state = pred_next_state.detach() - - pred_states = torch.vstack(pred_states) - pred_actions = torch.vstack(pred_actions) - pred_rs = torch.vstack(pred_rs) - pred_n_states = torch.vstack(pred_n_states) - - # Pay attention to here! It is dones in the Cares RL Code! - pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) - - # states, actions, rewards, next_states, not_dones - self._update_critic_actor( - pred_states, pred_actions, pred_rs, pred_n_states, pred_dones - ) - - def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: - self.learn_counter += 1 - - experiences = memory.sample_uniform(batch_size) - states, actions, rewards, next_states, dones, _ = experiences - - # Convert into tensor - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) - - # Step 1 train as usual - self._update_critic_actor(states, actions, rewards, next_states, dones) - - # # # Step 2 Dyna add more data - self._dyna_generate_and_train(next_states=next_states) - - def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: - experiences = memory.sample_consecutive(batch_size) - - ( - states, - actions, - rewards, - next_states, - _, - _, - next_actions, - next_rewards, - _, - _, - _, - ) = experiences - - states = torch.FloatTensor(np.asarray(states)).to(self.device) - actions = torch.FloatTensor(np.asarray(actions)).to(self.device) - rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) - next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) - next_rewards = ( - torch.FloatTensor(np.asarray(next_rewards)).to(self.device).unsqueeze(1) - ) - next_actions = torch.FloatTensor(np.asarray(next_actions)).to(self.device) - - # Step 1 train the world model. 
- self.world_model.train_world( - states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - next_actions=next_actions, - next_rewards=next_rewards, - ) - - def set_statistics(self, stats: dict) -> None: - self.world_model.set_statistics(stats) - - def save_models(self, filepath: str, filename: str) -> None: - if not os.path.exists(filepath): - os.makedirs(filepath) - - torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pth") - torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pth") - logging.info("models has been saved...") - - def load_models(self, filepath: str, filename: str) -> None: - self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pth")) - self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pth")) - logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py new file mode 100644 index 00000000..a11c326e --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_BIV_NS.py @@ -0,0 +1,488 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os +from scipy.optimize import minimize +import numpy as np +import torch + +import torch.nn.functional as F + +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) + +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta + + +class DynaSAC_BIVReweight: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
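+        # The temperature is optimised in log space so that alpha = exp(log_alpha)
+        # stays positive; the target entropy uses the usual SAC heuristic of -|A|.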
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
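+            # ratio_i rescales the weighted loss so its magnitude matches the
+            # unweighted L2 loss; the ratios are detached and act as constants.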
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. 
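+        Rolls the world model out for `horizon` steps, branching each state into
+        `num_samples` copies driven by random actions, and weights the imagined
+        transitions by the uncertainty estimate returned from `sampling`.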
+ :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var + ) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, curr_states, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times] + ) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times] + ) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times] + ) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times] + ) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times] + ) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(self.sample_times): + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) + sample1i += curr_states + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) + sample2i += curr_states + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) + sample3i += curr_states + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) + sample4i += curr_states + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) + sample5i += curr_states + + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. 
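+                # For every sampled next state, collect the actor's log-probabilities
+                # and the target critic's minimum Q-value so their variances and
+                # covariances can be estimated across the ensemble below.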
+ # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) + if self.mode == 1: + total_var = ( + var_r + gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + ) + if self.mode == 2: + total_var = var_r + gamma_sq * var_a + gamma_sq * var_q + if self.mode == 3: + total_var = var_r + + xi = self.get_optimal_xi(total_var.detach().cpu().squeeze().numpy()) + total_var += xi + # Weight = inverse of sum of weights * inverse of varaince. 
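+            # i.e. weights are proportional to the inverse of the total variance and
+            # are normalised to sum to one before being returned as per-sample weights.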
+ weights = 1.0 / total_var + ratio = 1.0 / torch.sum(weights) + total_stds = ratio * weights + return total_stds.detach() + + def get_optimal_xi(self, variances): + minimal_size = self.threshold_scale + if self.compute_eff_bs(self.get_iv_weights(variances)) >= minimal_size: + return 0 + fn = lambda x: np.abs( + self.compute_eff_bs(self.get_iv_weights(variances + np.abs(x))) + - minimal_size + ) + epsilon = minimize( + fn, 0, method="Nelder-Mead", options={"fatol": 1.0, "maxiter": 100} + ) + xi = np.abs(epsilon.x[0]) + xi = 0 if xi is None else xi + return xi + + def get_iv_weights(self, variances): + """ + Returns Inverse Variance weights + Params + ====== + variances (numpy array): variance of the targets + """ + weights = 1 / variances + weights = weights / np.sum(weights) + return weights + + def compute_eff_bs(self, weights): + # Compute original effective mini-batch size + eff_bs = 1 / np.sum(np.square(weights)) + eff_bs = eff_bs / np.shape(weights)[0] + return eff_bs + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py new file mode 100644 index 00000000..2915266d --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded.py @@ -0,0 +1,352 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +import torch.nn.functional as F + + +class DynaSAC_Bounded: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int, + ): + logging.info("-----------------------------------------------") + logging.info("----I am runing the DynaSAC_Bounded Agent! 
----") + logging.info("-----------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def _jsd(self, p, q): + p, q = p.view(-1, p.size(-1)).log_softmax(-1), q.view( + -1, q.size(-1) + ).log_softmax(-1) + m = 0.5 * (p + q) + return 0.5 * (self.k_l(m, p) + self.k_l(m, q)) + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) + # [10, 17] + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + uncert = torch.mean(total_unc, dim=1) + world_dist = F.softmax(uncert, dim=0) + world_dist -= torch.min(world_dist) + + # Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + # Q_s = torch.minimum(Q_1, Q_2) + # Q_s = Q_s.squeeze() + # multi_log_pi = Q_s + + multi_log_pi = multi_log_pi.squeeze() + policy_dist = F.softmax(multi_log_pi, dim=0) + + final_dist = ( + 1 - self.threshold + ) * policy_dist + self.threshold * world_dist + final_dist = F.softmax(final_dist, dim=0) + # candi = torch.argmax(final_dist) + new_dist = torch.distributions.Categorical(final_dist) + candi = new_dist.sample([1]).squeeze() + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def 
_train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + 
weights=torch.ones(rewards.shape), + ) + self._dyna_generate_and_train(next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. + # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_acts, _, _ = self.actor_net(pred_state) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, + pred_actions, + pred_rs, + pred_n_states, + pred_dones, + torch.ones(pred_rs.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py new file mode 100644 index 00000000..3caa4d41 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_Bounded_Yao.py @@ -0,0 +1,351 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+
+Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377
+
+This code runs automatic entropy tuning
+"""
+
+import copy
+import logging
+
+import numpy as np
+import torch
+from torch import nn
+from cares_reinforcement_learning.memory import MemoryBuffer
+
+from cares_reinforcement_learning.networks.world_models.ensemble import (
+    Ensemble_Dyna_Big,
+)
+import torch.nn.functional as F
+
+
+class DynaSAC_Bounded_Yao:
+    def __init__(
+        self,
+        actor_network: torch.nn.Module,
+        critic_network: torch.nn.Module,
+        world_network: Ensemble_Dyna_Big,
+        gamma: float,
+        tau: float,
+        action_num: int,
+        actor_lr: float,
+        critic_lr: float,
+        alpha_lr: float,
+        num_samples: int,
+        horizon: int,
+        device: torch.device,
+        train_reward: bool,
+        train_both: bool,
+        gripper: bool,
+        threshold: float,
+        exploration_sample: int,
+    ):
+        logging.info("-----------------------------------------------")
+        logging.info("----I am running the DynaSAC_Bounded_Yao Agent! ----")
+        logging.info("-----------------------------------------------")
+        self.train_reward = train_reward
+        self.train_both = train_both
+        self.gripper = gripper
+        self.exploration_sample = exploration_sample
+        self.threshold = threshold
+        self.set_stat = False
+        self.type = "mbrl"
+        self.device = device
+
+        # this may be called policy_net in other implementations
+        self.actor_net = actor_network.to(self.device)
+        # this may be called soft_q_net in other implementations
+        self.critic_net = critic_network.to(self.device)
+        self.target_critic_net = copy.deepcopy(self.critic_net)
+
+        self.gamma = gamma
+        self.tau = tau
+
+        self.num_samples = num_samples
+        self.horizon = horizon
+        self.action_num = action_num
+
+        self.learn_counter = 0
+        self.policy_update_freq = 1
+
+        self.actor_net_optimiser = torch.optim.Adam(
+            self.actor_net.parameters(), lr=actor_lr
+        )
+        self.critic_net_optimiser = torch.optim.Adam(
+            self.critic_net.parameters(), lr=critic_lr
+        )
+
+        # Set the initial alpha to 1.0, following other baselines.
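+        # Entropy temperature: alpha is learned through log_alpha with the
+        # standard SAC target entropy of -action_num.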
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def _jsd(self, p, q): + p, q = p.view(-1, p.size(-1)).log_softmax(-1), q.view( + -1, q.size(-1) + ).log_softmax(-1) + m = 0.5 * (p + q) + return 0.5 * (self.k_l(m, p) + self.k_l(m, q)) + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) + # [10, 17] + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + world_dist = torch.mean(total_unc, dim=1) + # world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) + + Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + Q_s = torch.minimum(Q_1, Q_2) + Q_s = Q_s.squeeze() + policy_dist = Q_s + + # policy_dist = multi_log_pi.squeeze() + # policy_dist = F.softmax(multi_log_pi, dim=0) + final_dist = policy_dist + self.threshold * world_dist + + # final_dist = F.softmax(final_dist, dim=0) + # candi = torch.argmax(final_dist) + # new_dist = torch.distributions.Categorical(final_dist) + # candi = new_dist.sample([1]).squeeze() + candi = torch.argmax(final_dist) + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + 
self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + self._dyna_generate_and_train(next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. 
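+                # Here the imagined actions are drawn from the current policy instead;
+                # the random-action variant is kept commented out below for reference.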
+ # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_acts, _, _ = self.actor_net(pred_state) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, + pred_actions, + pred_rs, + pred_n_states, + pred_dones, + torch.ones(pred_rs.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py new file mode 100644 index 00000000..f3bd4ee1 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS.py @@ -0,0 +1,293 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch + +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) + + +class DynaSAC_NS: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + ): + logging.info("-------------------------------------------") + logging.info("----I am runing the Dyna_SAC_NS Agent! ----") + logging.info("-------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + + critic_loss_total = critic_loss_one + 
critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + self._dyna_generate_and_train(next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. 
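+                # This variant samples the rollout actions from the actor network;
+                # the uniform random-action alternative remains commented out below.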
+ # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + pred_acts, _, _ = self.actor_net(pred_state) + pred_next_state, _, _, _ = self.world_model.pred_next_states( + pred_state, pred_acts + ) + + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, + pred_actions, + pred_rs, + pred_n_states, + pred_dones, + torch.ones(pred_rs.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py new file mode 100644 index 00000000..1092aeb3 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_NS_IW.py @@ -0,0 +1,373 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) + + +class DynaSAC_NS_IW: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold: float, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + ): + logging.info("-------------------------------------------") + logging.info("----I am runing the Dyna_SAC_NS Agent! ----") + logging.info("-------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.threshold = threshold + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
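+        # Automatic entropy tuning: log_alpha is a learnable scalar and
+        # alpha = exp(log_alpha) is used in both the critic and actor updates.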
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape).to(self.device) + weights = weights.to(self.device) + info = {} + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + assert (len(q_target.shape) == 2) and (q_target.shape[1] == 1) + q_target = q_target.detach() + q_values_one, q_values_two = self.critic_net(states, actions) + # critic_loss_one = F.mse_loss(q_values_one, q_target) + td_error1 = q_target - q_values_one # * weights + td_error2 = q_target - q_values_two # * weights + critic_loss_one = 0.5 * (td_error1.pow(2) * weights).mean() + critic_loss_two = 0.5 * (td_error2.pow(2) * weights).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + 
# Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + self._dyna_generate_and_train(next_states) + + def _dyna_generate_and_train(self, next_states: torch.Tensor) -> None: + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + weights = [] + + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. + # rand_acts = np.random.uniform(-1, 1, (pred_state.shape[0], self.action_num)) + # pred_acts = torch.FloatTensor(rand_acts).to(self.device) + (pred_acts, _, _) = self.actor_net(pred_state) + # [2560, 18] + pred_next_state, _, norm_means_, norm_vars_ = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + if self.gripper: + pred_reward = self.reward_function(pred_state, pred_next_state) + pred_next_state[:, -2:] = pred_state[:, -2:] + else: + pred_reward, _ = self.world_model.pred_rewards( + observation=pred_state, + action=pred_acts, + next_observation=pred_next_state, + ) + uncert = self.sampling(pred_state, norm_means_, norm_vars_) + # Q, A, R + weights.append(uncert) + + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(weights) + # Pay attention to here! It is dones in the Cares RL Code! 
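+            # Imagined rollouts are treated as non-terminal: `pred_dones` is all zeros,
+            # so the (1 - dones) factor in the critic target keeps bootstrapping through
+            # every model-generated transition.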
+ pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def sampling(self, curr_states, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: [num_model, batch_size * 10, observation_dim] + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + r_s = [] + act_logs = [] + q_s = [] + # For each model + for i in range(pred_means.shape[0]): + sample_times = 10 + samples = torch.distributions.Normal( + pred_means[i], pred_vars[i] + ).sample([sample_times]) + # For each sampling + for i in range(sample_times): + samples[i] = denormalize_observation_delta( + samples[i], self.world_model.statistics + ) + samples[i] += curr_states + pred_act, log_pi, _ = self.actor_net(samples[i]) + act_logs.append(log_pi) + # pred_rwd1 = self.world_model.pred_rewards(samples[i]) + rewards = self.reward_function(curr_states, samples[i]) + r_s.append(rewards) + qa1, qa2 = self.target_critic_net(samples[i], pred_act) + q_a = torch.minimum(qa1, qa2) + q_s.append(q_a) + r_s = torch.stack(r_s) + act_logs = torch.stack(act_logs) + q_s = torch.stack(q_s) + + var_r = torch.var(r_s, dim=0) + var_a = torch.var(act_logs, dim=0) + var_q = torch.var(q_s, dim=0) + + mean_a = torch.mean(act_logs, dim=0, keepdim=True) + mean_q = torch.mean(q_s, dim=0, keepdim=True) + diff_a = act_logs - mean_a + diff_q = q_s - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + mean_r = torch.mean(r_s, dim=0, keepdim=True) + diff_r = r_s - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) + # # For actor: alpha^2 * var_a + var_q + min_var = torch.min(total_var) + max_var = torch.max(total_var) + # As (max-min) decrease, threshold should go down. + threshold = self.threshold * (max_var - min_var) + min_var + total_var[total_var <= threshold] = threshold + # Inverse variance. 
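+            # Inverse-variance weighting: imagined transitions whose one-step target is
+            # more uncertain under the ensemble receive a smaller weight. Clamping the
+            # variance below at `threshold` bounds the largest weight, and the min-max
+            # normalisation plus the small epsilon below keeps every weight strictly positive.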
+ weights = 1 / total_var + # Normalization + new_min_var = torch.min(weights) + new_max_var = torch.max(weights) + weights = (weights - new_min_var) / (new_max_var - new_min_var) + weights += 0.0001 + return weights.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py new file mode 100644 index 00000000..1133eddb --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_SUNRISE_NS.py @@ -0,0 +1,443 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F + +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta + + +class DynaSAC_SUNRISEReweight: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? + with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. 
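+            # Rescaling by mean(unweighted loss) / mean(weighted loss) keeps the overall
+            # loss magnitude (and hence the effective learning rate) roughly unchanged,
+            # while the per-sample weights still shift where the gradient concentrates.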
+ + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. 
+ :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var + ) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, curr_states, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times] + ) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times] + ) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times] + ) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times] + ) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times] + ) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. + for i in range(self.sample_times): + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) + sample1i += curr_states + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) + sample2i += curr_states + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) + sample3i += curr_states + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) + sample4i += curr_states + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) + sample5i += curr_states + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. 
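+                # For every candidate next state the actor's log-probability and the
+                # minimum of the twin target critics are also collected; their spread
+                # across the 5 models x `sample_times` draws is what the variance and
+                # covariance terms further down are computed from.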
+ # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. + if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) + + total_stds = ( + torch.sigmoid(-1 * torch.sqrt(total_var) * self.threshold_scale) + 0.5 + ) + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py new file mode 100644 index 00000000..3f526b25 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/DynaSAC_UWAC_NS.py @@ -0,0 +1,447 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." 
+ +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F + +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) + +from cares_reinforcement_learning.util.helpers import denormalize_observation_delta + + +class DynaSAC_UWACReweight: + """ + Max as ? + """ + + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + num_samples: int, + horizon: int, + threshold_scale: float, + reweight_critic: bool, + reweight_actor: bool, + mode: int, + sample_times: int, + device: torch.device, + ): + self.type = "mbrl" + self.device = device + self.reweight_critic = reweight_critic + self.reweight_actor = reweight_actor + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.num_samples = num_samples + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + # Parameter + self.threshold_scale = threshold_scale + self.mode = mode + self.sample_times = sample_times + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + # pylint: disable-next=unused-argument to keep the same interface + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + ################## Update the Critic First #################### + # Have more target values? 
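+        # Standard SAC soft Bellman target, computed without gradients:
+        #   y = r + gamma * (1 - done) * (min(Q1', Q2') - alpha * log pi(a'|s'))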
+ with torch.no_grad(): + next_actions, next_log_pi, _ = self.actor_net(next_states) + target_q_one, target_q_two = self.target_critic_net( + next_states, next_actions + ) + target_q_values = ( + torch.minimum(target_q_one, target_q_two) - self._alpha * next_log_pi + ) + q_target = rewards + self.gamma * (1 - dones) * target_q_values + + q_values_one, q_values_two = self.critic_net(states, actions) + + if self.reweight_critic: + # Reweighted loss function. weight not participant in training. + l2_loss_one = (q_values_one - q_target).pow(2) + l2_loss_two = (q_values_two - q_target).pow(2) + + weights = weights.detach() + disc_l2_loss_one = l2_loss_one * weights + disc_l2_loss_two = l2_loss_two * weights + # A ratio to scale the loss back to original loss scale. + + ratio_1 = torch.mean(l2_loss_one) / torch.mean(disc_l2_loss_one) + ratio_1 = ratio_1.detach() + ratio_2 = torch.mean(l2_loss_two) / torch.mean(disc_l2_loss_two) + ratio_2 = ratio_2.detach() + + critic_loss_one = disc_l2_loss_one.mean() * ratio_1 + critic_loss_two = disc_l2_loss_two.mean() * ratio_2 + + critic_loss_total = critic_loss_one + critic_loss_two + else: + critic_loss_one = F.mse_loss(q_values_one, q_target) + critic_loss_two = F.mse_loss(q_values_two, q_target) + critic_loss_total = critic_loss_one + critic_loss_two + + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + + if self.reweight_actor: + weights = weights.detach() + a_loss = (self._alpha * first_log_p) - min_qf_pi + disc_actor_loss = a_loss * weights + ratio = torch.mean(a_loss) / torch.mean(disc_actor_loss) + ratio = ratio.detach() + actor_loss = ratio * torch.mean(disc_actor_loss) + else: + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + self.world_model.train_reward( + next_states=next_states, + rewards=rewards, + ) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = 
torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + full_weights = torch.ones(rewards.shape).to(self.device) + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=full_weights, + ) + # # # Step 3 Dyna add more data + self._dyna_generate_and_train(next_states=next_states) + + def _dyna_generate_and_train(self, next_states): + """ + Only off-policy Dyna will work. + :param next_states: + """ + pred_states = [] + pred_actions = [] + pred_rs = [] + pred_n_states = [] + pred_uncerts = [] + with torch.no_grad(): + pred_state = next_states + for _ in range(self.horizon): + pred_state = torch.repeat_interleave( + pred_state, self.num_samples, dim=0 + ) + # This part is controversial. But random actions is empirically better. + rand_acts = np.random.uniform( + -1, 1, (pred_state.shape[0], self.action_num) + ) + pred_acts = torch.FloatTensor(rand_acts).to(self.device) + + pred_next_state, _, pred_mean, pred_var = ( + self.world_model.pred_next_states(pred_state, pred_acts) + ) + uncert = self.sampling( + curr_states=pred_state, pred_means=pred_mean, pred_vars=pred_var + ) + uncert = uncert.unsqueeze(dim=1).to(self.device) + pred_uncerts.append(uncert) + + pred_reward = self.world_model.pred_rewards(pred_next_state) + pred_states.append(pred_state) + pred_actions.append(pred_acts.detach()) + pred_rs.append(pred_reward.detach()) + pred_n_states.append(pred_next_state.detach()) + pred_state = pred_next_state.detach() + pred_states = torch.vstack(pred_states) + pred_actions = torch.vstack(pred_actions) + pred_rs = torch.vstack(pred_rs) + pred_n_states = torch.vstack(pred_n_states) + pred_weights = torch.vstack(pred_uncerts) + # Pay attention to here! It is dones in the Cares RL Code! + pred_dones = torch.FloatTensor(np.zeros(pred_rs.shape)).to(self.device) + # states, actions, rewards, next_states, not_dones + self._train_policy( + pred_states, pred_actions, pred_rs, pred_n_states, pred_dones, pred_weights + ) + + def sampling(self, curr_states, pred_means, pred_vars): + """ + High std means low uncertainty. Therefore, divided by 1 + + :param pred_means: + :param pred_vars: + :return: + """ + with torch.no_grad(): + # 5 models. Each predict 10 next_states. + sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample( + [self.sample_times] + ) + sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample( + [self.sample_times] + ) + sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample( + [self.sample_times] + ) + sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample( + [self.sample_times] + ) + sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample( + [self.sample_times] + ) + rs = [] + acts = [] + qs = [] + # Varying the next_state's distribution. 
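+            # Each ensemble head's Gaussian is sampled `sample_times` times; the sampled
+            # deltas are denormalised with the world model's statistics and added to the
+            # current state to form candidate next states for the uncertainty estimate.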
+ for i in range(self.sample_times): + sample1i = denormalize_observation_delta( + sample1[i], self.world_model.statistics + ) + sample1i += curr_states + sample2i = denormalize_observation_delta( + sample2[i], self.world_model.statistics + ) + sample2i += curr_states + sample3i = denormalize_observation_delta( + sample3[i], self.world_model.statistics + ) + sample3i += curr_states + sample4i = denormalize_observation_delta( + sample4[i], self.world_model.statistics + ) + sample4i += curr_states + sample5i = denormalize_observation_delta( + sample5[i], self.world_model.statistics + ) + sample5i += curr_states + # 5 models, each sampled 10 times = 50, + pred_rwd1 = self.world_model.pred_rewards(sample1i) + pred_rwd2 = self.world_model.pred_rewards(sample2i) + pred_rwd3 = self.world_model.pred_rewards(sample3i) + pred_rwd4 = self.world_model.pred_rewards(sample4i) + pred_rwd5 = self.world_model.pred_rewards(sample5i) + rs.append(pred_rwd1) + rs.append(pred_rwd2) + rs.append(pred_rwd3) + rs.append(pred_rwd4) + rs.append(pred_rwd5) + # Each times, 5 models predict different actions. + # [2560, 17] + pred_act1, log_pi1, _ = self.actor_net(sample1i) + pred_act2, log_pi2, _ = self.actor_net(sample2i) + pred_act3, log_pi3, _ = self.actor_net(sample3i) + pred_act4, log_pi4, _ = self.actor_net(sample4i) + pred_act5, log_pi5, _ = self.actor_net(sample5i) + acts.append(log_pi1) + acts.append(log_pi2) + acts.append(log_pi3) + acts.append(log_pi4) + acts.append(log_pi5) + # How to become the same next state, different action. + # Now: sample1 sample2... same next state, different model. + # Pred_act1 pred_act2 same next_state, different actions. + # 5[] * 10[var of state] + qa1, qa2 = self.target_critic_net(sample1i, pred_act1) + qa = torch.minimum(qa1, qa2) + qb1, qb2 = self.target_critic_net(sample2i, pred_act2) + qb = torch.minimum(qb1, qb2) + qc1, qc2 = self.target_critic_net(sample3i, pred_act3) + qc = torch.minimum(qc1, qc2) + qd1, qd2 = self.target_critic_net(sample4i, pred_act4) + qd = torch.minimum(qd1, qd2) + qe1, qe2 = self.target_critic_net(sample5i, pred_act5) + qe = torch.minimum(qe1, qe2) + qs.append(qa) + qs.append(qb) + qs.append(qc) + qs.append(qd) + qs.append(qe) + + rs = torch.stack(rs) + acts = torch.stack(acts) + qs = torch.stack(qs) + + var_r = torch.var(rs, dim=0) + + if self.mode < 3: + var_a = torch.var(acts, dim=0) + var_q = torch.var(qs, dim=0) + + # Computing covariance. 
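+                # The variance and covariance terms below estimate how much the one-step
+                # target (reward, next-state log-probability and target Q) varies across
+                # the ensemble samples; `mode` ablates which terms enter total_var
+                # (mode 0 keeps all of them, mode 1 drops the reward-related terms).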
+ if self.mode < 2: + mean_a = torch.mean(acts, dim=0, keepdim=True) + mean_q = torch.mean(qs, dim=0, keepdim=True) + diff_a = acts - mean_a + diff_q = qs - mean_q + cov_aq = torch.mean(diff_a * diff_q, dim=0) + + if self.mode < 1: + mean_r = torch.mean(rs, dim=0, keepdim=True) + diff_r = rs - mean_r + cov_rq = torch.mean(diff_r * diff_q, dim=0) + + cov_ra = torch.mean(diff_r * diff_a, dim=0) + + gamma_sq = self.gamma * self.gamma + # Ablation + if self.mode == 0: + total_var = ( + var_r + + gamma_sq * var_a + + gamma_sq * var_q + + gamma_sq * 2 * cov_aq + + gamma_sq * 2 * cov_rq + + gamma_sq * 2 * cov_ra + ) + if self.mode == 1: + total_var = gamma_sq * var_a + gamma_sq * var_q + gamma_sq * 2 * cov_aq + + total_stds = torch.minimum( + self.threshold_scale / total_var, + torch.ones(total_var.shape).to(self.device) * 1.5, + ) + + return total_stds.detach() + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + + def save_models(self, filename: str, filepath: str = "models") -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + dir_exists = os.path.exists(path) + if not dir_exists: + os.makedirs(path) + torch.save(self.actor_net.state_dict(), f"{path}/{filename}_actor.pth") + torch.save(self.critic_net.state_dict(), f"{path}/{filename}_critic.pth") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + path = f"{filepath}/models" if filepath != "models" else filepath + self.actor_net.load_state_dict(torch.load(f"{path}/{filename}_actor.pth")) + self.critic_net.load_state_dict(torch.load(f"{path}/{filename}_critic.pth")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py new file mode 100644 index 00000000..bcfb356a --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC.py @@ -0,0 +1,281 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) + + +class STEVESAC: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + ): + logging.info("----------------------------------------") + logging.info("----I am runing the STEVESAC Agent! 
----") + logging.info("----------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + not_dones = 1 - dones + q_means = [] + q_weights = [] + accum_dist_rewards = torch.repeat_interleave( + rewards.unsqueeze(dim=0), repeats=30, dim=0 + ) + # 5 * 5 * 4 = 100 + for hori in range(self.horizon): + _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) + mean_predictions, all_mean_next, _, _ = ( + self.world_model.pred_next_states(next_states, curr_hori_action) + ) + pred_rewards, _ = self.world_model.pred_all_rewards( + observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next, + ) + pred_rewards *= self.gamma ** (hori + 1) + accum_dist_rewards += pred_rewards + # V = Q - alpha * logi + pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) + pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) + pred_v1 = pred_q1 - self._alpha * curr_hori_log_pi + pred_v2 = pred_q2 - self._alpha * curr_hori_log_pi + pred_v3 = pred_q3 - self._alpha * curr_hori_log_pi + pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi + q_0 = [] + for i in range(pred_rewards.shape[0]): + pred_tq1 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + ) + pred_tq2 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + ) + pred_tq3 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + ) + 
pred_tq4 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + ) + q_0.append(pred_tq1) + q_0.append(pred_tq2) + q_0.append(pred_tq3) + q_0.append(pred_tq4) + q_0 = torch.stack(q_0) + # Compute var, mean and add them to the queue + # [100, 256, 1] -> [256, 1] + mean_0 = torch.mean(q_0, dim=0) + q_means.append(mean_0) + var_0 = torch.var(q_0, dim=0) + var_0[torch.abs(var_0) < 0.0001] = 0.0001 + weights_0 = 1.0 / var_0 + q_weights.append(weights_0) + next_states = mean_predictions + all_means = torch.stack(q_means) + all_weights = torch.stack(q_weights) + total_weights = torch.sum(all_weights, dim=0) + for n in range(self.horizon): + all_weights[n] /= total_weights + q_target = torch.sum(all_weights * all_means, dim=0) + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + 
actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py new file mode 100644 index 00000000..8e602c20 --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded.py @@ -0,0 +1,345 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +import torch.nn.functional as F + + +class STEVESAC_Bounded: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int, + ): + logging.info("------------------------------------------------") + logging.info("----I am runing the STEVESAC_Bounded Agent! ----") + logging.info("------------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. 
+ self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) + # [10, 17] + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + uncert = torch.mean(total_unc, dim=1) + world_dist = F.softmax(uncert, dim=0) + world_dist -= torch.min(world_dist) + + # Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + # Q_s = torch.minimum(Q_1, Q_2) + # Q_s = Q_s.squeeze() + # multi_log_pi = Q_s + + multi_log_pi = multi_log_pi.squeeze() + policy_dist = F.softmax(multi_log_pi, dim=0) + final_dist = ( + 1 - self.threshold + ) * policy_dist + self.threshold * world_dist + + # candi = torch.argmax(final_dist) + final_dist = F.softmax(final_dist, dim=0) + new_dist = torch.distributions.Categorical(final_dist) + candi = new_dist.sample([1]).squeeze() + + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + not_dones = 1 - dones + q_means = [] + q_weights = [] + accum_dist_rewards = torch.repeat_interleave( + rewards.unsqueeze(dim=0), repeats=30, dim=0 + ) + # 5 * 5 * 4 = 100 + for hori in range(self.horizon): + _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) + mean_predictions, all_mean_next, _, _ = ( + self.world_model.pred_next_states(next_states, curr_hori_action) + ) + pred_rewards, _ = self.world_model.pred_all_rewards( + observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next, + ) + pred_rewards *= self.gamma ** (hori + 1) + accum_dist_rewards += pred_rewards + # V = Q - alpha * logi + pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) + pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) + pred_v1 = pred_q1 - self._alpha * 
curr_hori_log_pi + pred_v2 = pred_q2 - self._alpha * curr_hori_log_pi + pred_v3 = pred_q3 - self._alpha * curr_hori_log_pi + pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi + q_0 = [] + for i in range(pred_rewards.shape[0]): + pred_tq1 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + ) + pred_tq2 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + ) + pred_tq3 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + ) + pred_tq4 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + ) + q_0.append(pred_tq1) + q_0.append(pred_tq2) + q_0.append(pred_tq3) + q_0.append(pred_tq4) + q_0 = torch.stack(q_0) + # Compute var, mean and add them to the queue + # [100, 256, 1] -> [256, 1] + mean_0 = torch.mean(q_0, dim=0) + q_means.append(mean_0) + var_0 = torch.var(q_0, dim=0) + var_0[torch.abs(var_0) < 0.0001] = 0.0001 + weights_0 = 1.0 / var_0 + q_weights.append(weights_0) + next_states = mean_predictions + all_means = torch.stack(q_means) + all_weights = torch.stack(q_weights) + total_weights = torch.sum(all_weights, dim=0) + for n in range(self.horizon): + all_weights[n] /= total_weights + q_target = torch.sum(all_weights * all_means, dim=0) + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + 
states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py new file mode 100644 index 00000000..e9f88d7f --- /dev/null +++ b/cares_reinforcement_learning/algorithm/mbrl/STEVESAC_Bounded_Yao.py @@ -0,0 +1,344 @@ +""" +Sutton, Richard S. "Dyna, an integrated architecture for learning, planning, and reacting." + +Original Paper: https://dl.acm.org/doi/abs/10.1145/122344.122377 + +This code runs automatic entropy tuning +""" + +import copy +import logging + +import numpy as np +import torch +from torch import nn +from cares_reinforcement_learning.memory import MemoryBuffer + +from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, +) +import torch.nn.functional as F + + +class STEVESAC_Bounded_Yao: + def __init__( + self, + actor_network: torch.nn.Module, + critic_network: torch.nn.Module, + world_network: Ensemble_Dyna_Big, + gamma: float, + tau: float, + action_num: int, + actor_lr: float, + critic_lr: float, + alpha_lr: float, + horizon: int, + device: torch.device, + train_reward: bool, + train_both: bool, + gripper: bool, + threshold: float, + exploration_sample: int, + ): + logging.info("------------------------------------------------") + logging.info("----I am runing the STEVESAC_Bounded Agent! 
----") + logging.info("------------------------------------------------") + self.train_reward = train_reward + self.train_both = train_both + self.gripper = gripper + self.exploration_sample = exploration_sample + self.threshold = threshold + self.set_stat = False + self.type = "mbrl" + self.device = device + + # this may be called policy_net in other implementations + self.actor_net = actor_network.to(self.device) + # this may be called soft_q_net in other implementations + self.critic_net = critic_network.to(self.device) + self.target_critic_net = copy.deepcopy(self.critic_net) + + self.gamma = gamma + self.tau = tau + + self.horizon = horizon + self.action_num = action_num + + self.learn_counter = 0 + self.policy_update_freq = 1 + + self.actor_net_optimiser = torch.optim.Adam( + self.actor_net.parameters(), lr=actor_lr + ) + self.critic_net_optimiser = torch.optim.Adam( + self.critic_net.parameters(), lr=critic_lr + ) + + # Set to initial alpha to 1.0 according to other baselines. + self.log_alpha = torch.FloatTensor([np.log(1.0)]).to(device) + self.log_alpha.requires_grad = True + self.target_entropy = -action_num + self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr) + + # World model + self.world_model = world_network + + self.k_l = nn.KLDivLoss(reduction="batchmean", log_target=True) + + @property + def _alpha(self) -> float: + return self.log_alpha.exp() + + def select_action_from_policy( + self, state: np.ndarray, evaluation: bool = False, noise_scale: float = 0 + ) -> np.ndarray: + # note that when evaluating this algorithm we need to select mu as + self.actor_net.eval() + with torch.no_grad(): + state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) + if evaluation is False: + (action, _, _) = self.actor_net(state_tensor) + if self.threshold == 0: + (action, _, _) = self.actor_net(state_tensor) + else: + if self.set_stat: + multi_state_tensor = torch.repeat_interleave( + state_tensor, self.exploration_sample, dim=0 + ) + (multi_action, multi_log_pi, _) = self.actor_net( + multi_state_tensor + ) + # Estimate uncertainty + # [6, 10, 17] + _, _, nstate_means, nstate_vars = ( + self.world_model.pred_next_states( + observation=multi_state_tensor, actions=multi_action + ) + ) + # [10, 17] + aleatoric = torch.mean(nstate_vars**2, dim=0) ** 0.5 + epistemic = torch.var(nstate_means, dim=0) ** 0.5 + aleatoric = torch.clamp(aleatoric, max=10e3) + epistemic = torch.clamp(epistemic, max=10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + world_dist = torch.mean(total_unc, dim=1) + # world_dist = F.softmax(uncert, dim=0) + # world_dist -= torch.min(world_dist) + + Q_1, Q_2 = self.critic_net(multi_state_tensor, multi_action) + Q_s = torch.minimum(Q_1, Q_2) + Q_s = Q_s.squeeze() + policy_dist = Q_s + + # multi_log_pi = multi_log_pi.squeeze() + # policy_dist = F.softmax(multi_log_pi, dim=0) + + final_dist = policy_dist + self.threshold * world_dist + + # candi = torch.argmax(final_dist) + # final_dist = F.softmax(final_dist, dim=0) + # new_dist = torch.distributions.Categorical(final_dist) + candi = torch.argmax(final_dist) + + action = multi_action[candi] + else: + (action, _, _) = self.actor_net(state_tensor) + else: + (_, _, action) = self.actor_net(state_tensor) + action = action.cpu().data.numpy().flatten() + self.actor_net.train() + return action + + def _train_policy( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + weights: torch.Tensor, + ) -> None: + if 
weights is None: + weights = torch.ones(rewards.shape) + ################## Update the Critic First #################### + with torch.no_grad(): + not_dones = 1 - dones + q_means = [] + q_weights = [] + accum_dist_rewards = torch.repeat_interleave( + rewards.unsqueeze(dim=0), repeats=30, dim=0 + ) + # 5 * 5 * 4 = 100 + for hori in range(self.horizon): + _, curr_hori_log_pi, curr_hori_action = self.actor_net(next_states) + mean_predictions, all_mean_next, _, _ = ( + self.world_model.pred_next_states(next_states, curr_hori_action) + ) + pred_rewards, _ = self.world_model.pred_all_rewards( + observation=next_states, + action=curr_hori_action, + next_observation=all_mean_next, + ) + pred_rewards *= self.gamma ** (hori + 1) + accum_dist_rewards += pred_rewards + # V = Q - alpha * logi + pred_q1, pred_q2 = self.target_critic_net(next_states, curr_hori_action) + pred_q3, pred_q4 = self.critic_net(next_states, curr_hori_action) + pred_v1 = pred_q1 - self._alpha * curr_hori_log_pi + pred_v2 = pred_q2 - self._alpha * curr_hori_log_pi + pred_v3 = pred_q3 - self._alpha * curr_hori_log_pi + pred_v4 = pred_q4 - self._alpha * curr_hori_log_pi + q_0 = [] + for i in range(pred_rewards.shape[0]): + pred_tq1 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v1 + ) + pred_tq2 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v2 + ) + pred_tq3 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v3 + ) + pred_tq4 = ( + accum_dist_rewards[i] + + not_dones * (self.gamma ** (hori + 2)) * pred_v4 + ) + q_0.append(pred_tq1) + q_0.append(pred_tq2) + q_0.append(pred_tq3) + q_0.append(pred_tq4) + q_0 = torch.stack(q_0) + # Compute var, mean and add them to the queue + # [100, 256, 1] -> [256, 1] + mean_0 = torch.mean(q_0, dim=0) + q_means.append(mean_0) + var_0 = torch.var(q_0, dim=0) + var_0[torch.abs(var_0) < 0.0001] = 0.0001 + weights_0 = 1.0 / var_0 + q_weights.append(weights_0) + next_states = mean_predictions + all_means = torch.stack(q_means) + all_weights = torch.stack(q_weights) + total_weights = torch.sum(all_weights, dim=0) + for n in range(self.horizon): + all_weights[n] /= total_weights + q_target = torch.sum(all_weights * all_means, dim=0) + + q_values_one, q_values_two = self.critic_net(states, actions) + critic_loss_one = ((q_values_one - q_target).pow(2)).mean() + critic_loss_two = ((q_values_two - q_target).pow(2)).mean() + critic_loss_total = critic_loss_one + critic_loss_two + # Update the Critic + self.critic_net_optimiser.zero_grad() + critic_loss_total.backward() + self.critic_net_optimiser.step() + + ################## Update the Actor Second #################### + pi, first_log_p, _ = self.actor_net(states) + qf1_pi, qf2_pi = self.critic_net(states, pi) + min_qf_pi = torch.minimum(qf1_pi, qf2_pi) + actor_loss = ((self._alpha * first_log_p) - min_qf_pi).mean() + + # Update the Actor + self.actor_net_optimiser.zero_grad() + actor_loss.backward() + self.actor_net_optimiser.step() + + # Update the temperature + alpha_loss = -( + self.log_alpha * (first_log_p + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + if self.learn_counter % self.policy_update_freq == 0: + for target_param, param in zip( + self.target_critic_net.parameters(), self.critic_net.parameters() + ): + target_param.data.copy_( + param.data * self.tau + target_param.data * (1.0 - self.tau) + ) + + def train_world_model(self, memory: MemoryBuffer, batch_size: 
int) -> None: + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, _, _ = experiences + + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + + self.world_model.train_world( + states=states, + actions=actions, + next_states=next_states, + ) + + batch_size = len(states) + # Reshape to batch_size x whatever + if self.train_reward: + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device) + rewards = rewards.unsqueeze(0).reshape(batch_size, 1) + if self.train_both: + self.world_model.train_together(states, actions, rewards) + else: + self.world_model.train_reward(states, actions, next_states, rewards) + + def train_policy(self, memory: MemoryBuffer, batch_size: int) -> None: + self.learn_counter += 1 + + experiences = memory.sample_uniform(batch_size) + states, actions, rewards, next_states, dones, _ = experiences + + # Convert into tensor + states = torch.FloatTensor(np.asarray(states)).to(self.device) + actions = torch.FloatTensor(np.asarray(actions)).to(self.device) + rewards = torch.FloatTensor(np.asarray(rewards)).to(self.device).unsqueeze(1) + next_states = torch.FloatTensor(np.asarray(next_states)).to(self.device) + dones = torch.LongTensor(np.asarray(dones)).to(self.device).unsqueeze(1) + + # Step 2 train as usual + self._train_policy( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + weights=torch.ones(rewards.shape), + ) + + def reward_function(self, curr_states, next_states): + target_goal_tensor = curr_states[:, -2:] + object_current = next_states[:, -4:-2] + sq_diff = (target_goal_tensor - object_current) ** 2 + # [256, 1] + goal_distance_after = torch.sqrt(torch.sum(sq_diff, dim=1)).unsqueeze(dim=1) + pred_reward = -goal_distance_after + 70 + mask1 = goal_distance_after <= 10 + mask2 = goal_distance_after > 70 + pred_reward[mask1] = 800 + pred_reward[mask2] = 0 + return pred_reward + + def set_statistics(self, stats: dict) -> None: + self.world_model.set_statistics(stats) + self.set_stat = True + + def save_models(self, filename: str, filepath: str = "models") -> None: + # if not os.path.exists(filepath): + # os.makedirs(filepath) + # print(filepath) + # logging.info(filepath) + # torch.save(self.actor_net.state_dict(), f"{filepath}/{filename}_actor.pht") + # torch.save(self.critic_net.state_dict(), f"{filepath}/{filename}_critic.pht") + logging.info("models has been saved...") + + def load_models(self, filepath: str, filename: str) -> None: + self.actor_net.load_state_dict(torch.load(f"{filepath}/{filename}_actor.pht")) + self.critic_net.load_state_dict(torch.load(f"{filepath}/{filename}_critic.pht")) + logging.info("models has been loaded...") diff --git a/cares_reinforcement_learning/algorithm/mbrl/__init__.py b/cares_reinforcement_learning/algorithm/mbrl/__init__.py index 6d38815d..2c0b2dd2 100644 --- a/cares_reinforcement_learning/algorithm/mbrl/__init__.py +++ b/cares_reinforcement_learning/algorithm/mbrl/__init__.py @@ -1 +1,15 @@ -from .DynaSAC import DynaSAC +# Baseline +from .DynaSAC_NS import DynaSAC_NS +from .STEVESAC import STEVESAC + +# Bounded Exploration +from .DynaSAC_Bounded import DynaSAC_Bounded +from .STEVESAC_Bounded import STEVESAC_Bounded +from .DynaSAC_Bounded_Yao import DynaSAC_Bounded_Yao +from .STEVESAC_Bounded_Yao import STEVESAC_Bounded_Yao + +# Immersive Weighting +from .DynaSAC_NS_IW import 
DynaSAC_NS_IW +from .DynaSAC_SUNRISE_NS import DynaSAC_SUNRISEReweight +from .DynaSAC_UWAC_NS import DynaSAC_UWACReweight +from .DynaSAC_BIV_NS import DynaSAC_BIVReweight diff --git a/cares_reinforcement_learning/algorithm/policy/SAC.py b/cares_reinforcement_learning/algorithm/policy/SAC.py index 8ba82ae8..313cd3f3 100644 --- a/cares_reinforcement_learning/algorithm/policy/SAC.py +++ b/cares_reinforcement_learning/algorithm/policy/SAC.py @@ -59,7 +59,7 @@ def __init__( # Temperature (alpha) for the entropy loss # Set to initial alpha to 1.0 according to other baselines. init_temperature = 1.0 - self.log_alpha = torch.tensor(np.log(init_temperature)).to(device) + self.log_alpha = torch.FloatTensor([np.log(init_temperature)]).to(device) self.log_alpha.requires_grad = True self.log_alpha_optimizer = torch.optim.Adam( [self.log_alpha], lr=config.alpha_lr diff --git a/cares_reinforcement_learning/networks/SAC/__init__.py b/cares_reinforcement_learning/networks/SAC/__init__.py index cc7f4410..d5c3e629 100644 --- a/cares_reinforcement_learning/networks/SAC/__init__.py +++ b/cares_reinforcement_learning/networks/SAC/__init__.py @@ -1,2 +1,3 @@ from .actor import Actor, DefaultActor from .critic import Critic, DefaultCritic +from .triple_critic import TriCritic diff --git a/cares_reinforcement_learning/networks/SAC/triple_critic.py b/cares_reinforcement_learning/networks/SAC/triple_critic.py new file mode 100644 index 00000000..2250a269 --- /dev/null +++ b/cares_reinforcement_learning/networks/SAC/triple_critic.py @@ -0,0 +1,44 @@ +import torch +from torch import nn + + +class TriCritic(nn.Module): + def __init__(self, observation_size: int, num_actions: int): + super().__init__() + + self.hidden_size = [256, 256] + + # Q1 architecture + self.Q1 = nn.Sequential( + nn.Linear(observation_size + num_actions, self.hidden_size[0]), + nn.ReLU(), + nn.Linear(self.hidden_size[0], self.hidden_size[1]), + nn.ReLU(), + nn.Linear(self.hidden_size[1], 1), + ) + + # Q2 architecture + self.Q2 = nn.Sequential( + nn.Linear(observation_size + num_actions, self.hidden_size[0]), + nn.ReLU(), + nn.Linear(self.hidden_size[0], self.hidden_size[1]), + nn.ReLU(), + nn.Linear(self.hidden_size[1], 1), + ) + + self.Q3 = nn.Sequential( + nn.Linear(observation_size + num_actions, self.hidden_size[0]), + nn.ReLU(), + nn.Linear(self.hidden_size[0], self.hidden_size[1]), + nn.ReLU(), + nn.Linear(self.hidden_size[1], 1), + ) + + def forward( + self, state: torch.Tensor, action: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + obs_action = torch.cat([state, action], dim=1) + q1 = self.Q1(obs_action) + q2 = self.Q2(obs_action) + q3 = self.Q3(obs_action) + return q1, q2, q3 diff --git a/cares_reinforcement_learning/networks/world_models/__init__.py b/cares_reinforcement_learning/networks/world_models/__init__.py index 8f30f0dc..3efe4bb0 100644 --- a/cares_reinforcement_learning/networks/world_models/__init__.py +++ b/cares_reinforcement_learning/networks/world_models/__init__.py @@ -1,9 +1,4 @@ -from cares_reinforcement_learning.networks.world_models.simple_rewards import ( - SimpleReward, -) -from cares_reinforcement_learning.networks.world_models.simple_dynamics import ( - SimpleDynamics, -) -from cares_reinforcement_learning.networks.world_models.ensemble_integrated import ( - EnsembleWorldReward, -) +# from cares_reinforcement_learning.networks.world_models.z_ensemble_integrated import ( +# EnsembleWorldReward, +# ) +from .world_model import World_Model diff --git 
a/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py b/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py new file mode 100644 index 00000000..69fd4fe7 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble/__init__.py @@ -0,0 +1,2 @@ +from .world_ensemble_one_rwd import Ensemble_Dyna_One_Reward +from .world_ensemble_big import Ensemble_Dyna_Big diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py new file mode 100644 index 00000000..7c88f74a --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_big.py @@ -0,0 +1,311 @@ +import math +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim +from cares_reinforcement_learning.networks.world_models.simple import ( + Probabilistic_Dynamics, +) +from cares_reinforcement_learning.networks.world_models import World_Model +from cares_reinforcement_learning.util.helpers import normalize_observation_delta +from cares_reinforcement_learning.util import ( + denormalize_observation_delta, + normalize_observation, +) + + +def sig(x): + """ + Sigmoid + :param x: + :return: + """ + return 1 / (1 + np.exp(-x)) + + +class Ensemble_Dyna_Big(World_Model): + """ + World Model + """ + + def __init__( + self, + observation_size: int, + num_actions: int, + device: str, + l_r: float = 0.001, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = True, + num_models: int = 7, + boost_inter: int = 3, + num_rwd_model: int = 1, + ): + super().__init__( + observation_size=observation_size, + num_actions=num_actions, + l_r=l_r, + device=device, + hidden_size=hidden_size, + sas=sas, + prob_rwd=prob_rwd, + num_rwd_model=num_rwd_model, + ) + + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + self.l_r = l_r + self.curr_losses = np.ones((self.num_models,)) * 5 + + self.world_models = [] + for i in range(self.num_models): + i %= 3 + if i == 0: + model = Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=[128, 128, 128], + ) + if i == 1: + model = Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=[128, 128], + ) + if i == 2: + model = Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=[256, 256], + ) + self.world_models.append(model) + + self.optimizers = [ + optim.Adam(self.world_models[i].parameters(), lr=l_r) + for i in range(self.num_models) + ] + self.statistics = {} + # Bring all reward prediction and dynamic rediction networks to device. 
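[Editorial aside, not part of the patch] The ensemble built above is trained round-robin by train_world() further down in this file: `update_counter` and `boost_inter` pick which member receives the next gradient step, each model getting `boost_inter` consecutive updates before the counter moves on. A minimal, self-contained sketch of that schedule, using the defaults declared in this class (num_models=7, boost_inter=3); values are illustrative only:

import math

num_models, boost_inter = 7, 3          # defaults of Ensemble_Dyna_Big
counter, schedule = 0, []
for _ in range(2 * num_models * boost_inter):            # two full passes
    schedule.append(int(math.floor(counter / boost_inter)))
    counter = (counter + 1) % (boost_inter * num_models)  # wrap as in train_world()

print(schedule[:9])  # [0, 0, 0, 1, 1, 1, 2, 2, 2] -- boost_inter consecutive updates per model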
+ self.device = device + for model in self.world_models: + model.to(device) + self.boost_inter = boost_inter + self.update_counter = 0 + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + norm_means = [] + norm_vars = [] + normalized_observation = normalize_observation(observation, self.statistics) + # Iterate over the neural networks and get the predictions + for model in self.world_models: + # Predict delta + n_mean, n_var = model.forward(normalized_observation, actions) + norm_means.append(n_mean) + norm_vars.append(n_var) + predictions_vars = torch.stack(norm_vars) + predictions_norm_means = torch.stack(norm_means) + # Normalized + predictions_means = denormalize_observation_delta( + predictions_norm_means, self.statistics + ) + all_predictions = predictions_means + observation + denorm_avg = torch.mean(predictions_means, dim=0) + prediction = denorm_avg + observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + # This boosting part is useless, cause inaccuracy. + # weights = 1.5 - sig(self.curr_losses) + # weights /= np.max(weights) + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # min_ = np.min(self.curr_losses) + # max_ = np.max(self.curr_losses) + # delta = max_ - min_ + # if delta == 0: + # delta = 0.1 + # temp = (self.curr_losses - min_) / delta * 5.0 + # temp = sig(temp) + # temp[index] * + index = int(math.floor(self.update_counter / self.boost_inter)) + target = next_states - states + delta_targets_normalized = normalize_observation_delta(target, self.statistics) + normalized_state = normalize_observation(states, self.statistics) + n_mean, n_var = self.world_models[index].forward(normalized_state, actions) + model_loss = F.gaussian_nll_loss( + input=n_mean, target=delta_targets_normalized, var=n_var + ).mean() + self.optimizers[index].zero_grad() + model_loss.backward() + self.optimizers[index].step() + self.curr_losses[index] = model_loss.item() + self.update_counter += 1 + self.update_counter %= self.boost_inter * self.num_models + + def estimate_uncertainty( + self, observation: torch.Tensor, actions: torch.Tensor, train_reward: bool + ) -> tuple[float, float, torch.Tensor]: + """ + Estimate uncertainty. 
+ + :param observation: + :param actions: + """ + next_state_samples = None + uncert_rwd = 0.0 + means = [] + vars_s = [] + normalized_state = normalize_observation(observation, self.statistics) + for model in self.world_models: + mean, var = model.forward(normalized_state, actions) + means.append(mean) + vars_s.append(var) + vars_s = torch.stack(vars_s).squeeze() + noises = vars_s.cpu().detach().numpy() + aleatoric = (noises**2).mean(axis=0) ** 0.5 + all_means = torch.stack(means).squeeze() + epistemic = all_means.cpu().detach().numpy() + epistemic = epistemic.var(axis=0) ** 0.5 + aleatoric = np.minimum(aleatoric, 10e3) + epistemic = np.minimum(epistemic, 10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + uncert = np.mean(total_unc) + if train_reward: + # Reward Uncertainty + sample_times = 20 + means = torch.vstack(means) + dist = torch.distributions.Normal(means, vars_s) + samples = dist.sample([sample_times]) + samples = torch.reshape( + samples, (sample_times * self.num_models, self.observation_size) + ) + samples = denormalize_observation_delta(samples, self.statistics) + observationss = torch.repeat_interleave( + observation, repeats=sample_times * self.num_models, dim=0 + ) + actionss = torch.repeat_interleave( + actions, repeats=sample_times * self.num_models, dim=0 + ) + samples += observationss + + if self.sas: + if self.prob_rwd: + rewards, rwd_var = self.reward_network( + observationss, actionss, samples + ) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 + else: + rewards = self.reward_network(observationss, actionss, samples) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + if self.prob_rwd: + rewards, rwd_var = self.reward_network(samples, actionss) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 + else: + rewards = self.reward_network(samples, actionss) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + dist = torch.distributions.Normal(all_means, vars_s) + next_state_samples = dist.sample([20]) + next_state_samples = next_state_samples.reshape( + (self.num_models * 20, self.observation_size) + ) + next_state_samples = denormalize_observation_delta( + next_state_samples, self.statistics + ) + next_state_samples += observation + return uncert, uncert_rwd, next_state_samples + + def train_together( + self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor + ): + sample_times = 20 + normalized_state = normalize_observation(states, self.statistics) + mean_s = [] + var_s = [] + act_s = [] + state_s = [] + rwd_s = [] + for i in range(self.num_models): + mean, var = self.world_models[i].forward(normalized_state, actions) + mean_s.append(mean) + var_s.append(var) + act_s.append(actions) + state_s.append(states) + rwd_s.append(rewards) + + mean_s = torch.vstack(mean_s) + var_s = torch.vstack(var_s) + act_s = torch.vstack(act_s) + state_s = torch.vstack(state_s) + rwd_s = torch.vstack(rwd_s) + + dist = torch.distributions.Normal(mean_s, var_s) + samples = dist.sample([sample_times]) + + actions = torch.repeat_interleave( + act_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) 
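# Annotation (not part of the diff): at this point `samples` holds `sample_times`
# draws from each ensemble member's Normal(mean, var) over the normalized state
# delta, shape [sample_times, num_models * batch, obs_dim]; `actions`, `states`
# and `rwd_s` are tiled to match. Everything is then flattened to
# [sample_times * num_models * batch, ...], the sampled deltas are denormalized
# and added to `states` to form imagined next states, and the reward model is
# fitted on (s, a, s') with Gaussian NLL (prob_rwd) or MSE. Note this method
# references self.reward_network / self.reward_optimizer, which the World_Model
# base class added in this patch does not define (it builds self.rwd_models /
# self.rwd_model_optimizers), so this path appears to rely on attributes set elsewhere.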
+ states = torch.repeat_interleave( + state_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + rwd_s = torch.repeat_interleave( + rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + + samples = torch.reshape( + samples, (samples.shape[0] * samples.shape[1], self.observation_size) + ) + states = torch.reshape( + states, (states.shape[0] * states.shape[1], states.shape[2]) + ) + actions = torch.reshape( + actions, (actions.shape[0] * actions.shape[1], actions.shape[2]) + ) + rwd_s = torch.reshape(rwd_s, (rwd_s.shape[0] * rwd_s.shape[1], rwd_s.shape[2])) + + samples = denormalize_observation_delta(samples, self.statistics) + samples += states + + if self.prob_rwd: + if self.sas: + rwd_mean, rwd_var = self.reward_network(states, actions, samples) + else: + rwd_mean, rwd_var = self.reward_network(samples) + rwd_loss = F.gaussian_nll_loss(rwd_mean, rwd_s, rwd_var) + else: + if self.sas: + rwd_mean = self.reward_network(states, actions, samples) + else: + rwd_mean = self.reward_network(samples) + rwd_loss = F.mse_loss(rwd_mean, rwd_s) + self.reward_optimizer.zero_grad() + rwd_loss.backward() + self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py new file mode 100644 index 00000000..334b8419 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/ensemble/world_ensemble_one_rwd.py @@ -0,0 +1,288 @@ +import math +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils +from torch import optim +from cares_reinforcement_learning.networks.world_models.simple import ( + Probabilistic_Dynamics, +) +from cares_reinforcement_learning.networks.world_models import World_Model +from cares_reinforcement_learning.util.helpers import normalize_observation_delta +from cares_reinforcement_learning.util import ( + denormalize_observation_delta, + normalize_observation, +) + + +def sig(x): + """ + Sigmoid + :param x: + :return: + """ + return 1 / (1 + np.exp(-x)) + + +class Ensemble_Dyna_One_Reward(World_Model): + """ + World Model + """ + + def __init__( + self, + observation_size: int, + num_actions: int, + device: str, + num_models: int = 5, + l_r: float = 0.001, + boost_inter: int = 3, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = True, + ): + super().__init__( + observation_size, num_actions, l_r, device, hidden_size, sas, prob_rwd + ) + if hidden_size is None: + hidden_size = [128, 128] + self.num_models = num_models + self.observation_size = observation_size + self.num_actions = num_actions + self.l_r = l_r + self.curr_losses = np.ones((self.num_models,)) * 5 + self.world_models = [ + Probabilistic_Dynamics( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + ) + for _ in range(self.num_models) + ] + self.optimizers = [ + optim.Adam(self.world_models[i].parameters(), lr=l_r) + for i in range(self.num_models) + ] + self.statistics = {} + # Bring all reward prediction and dynamic rediction networks to device. 
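[Editorial aside, not part of the patch] The ensembles in this file use the same aleatoric/epistemic split as estimate_uncertainty() later in this class: per-model predictive variances give the aleatoric term, disagreement between per-model means gives the epistemic term, and the two are combined in quadrature. A self-contained sketch with made-up sizes (M matches the class default of 5; batch and state dimension are arbitrary):

import torch

M, B, obs_dim = 5, 4, 3                       # models, batch, state dim (illustrative)
means = torch.randn(M, B, obs_dim)            # per-model predicted delta means
variances = torch.rand(M, B, obs_dim) + 1e-3  # per-model predicted variances

aleatoric = torch.mean(variances**2, dim=0) ** 0.5  # RMS of the predicted noise levels
epistemic = torch.var(means, dim=0) ** 0.5          # spread of the ensemble means
total_unc = (aleatoric**2 + epistemic**2) ** 0.5    # [B, obs_dim], combined in quadrature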
+ self.device = device + for model in self.world_models: + model.to(device) + self.boost_inter = boost_inter + self.update_counter = 0 + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + norm_means = [] + norm_vars = [] + normalized_observation = normalize_observation(observation, self.statistics) + # Iterate over the neural networks and get the predictions + for model in self.world_models: + # Predict delta + n_mean, n_var = model.forward(normalized_observation, actions) + norm_means.append(n_mean) + norm_vars.append(n_var) + predictions_vars = torch.stack(norm_vars) + predictions_norm_means = torch.stack(norm_means) + # Normalized + predictions_means = denormalize_observation_delta( + predictions_norm_means, self.statistics + ) + all_predictions = predictions_means + observation + denorm_avg = torch.mean(predictions_means, dim=0) + prediction = denorm_avg + observation + return prediction, all_predictions, predictions_norm_means, predictions_vars + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + # This boosting part is useless, cause inaccuracy. + # weights = 1.5 - sig(self.curr_losses) + # weights /= np.max(weights) + assert len(states.shape) >= 2 + assert len(actions.shape) == 2 + assert ( + states.shape[1] + actions.shape[1] + == self.num_actions + self.observation_size + ) + # min_ = np.min(self.curr_losses) + # max_ = np.max(self.curr_losses) + # delta = max_ - min_ + # if delta == 0: + # delta = 0.1 + # temp = (self.curr_losses - min_) / delta * 5.0 + # temp = sig(temp) + # temp[index] * + index = int(math.floor(self.update_counter / self.boost_inter)) + target = next_states - states + delta_targets_normalized = normalize_observation_delta(target, self.statistics) + normalized_state = normalize_observation(states, self.statistics) + n_mean, n_var = self.world_models[index].forward(normalized_state, actions) + model_loss = F.gaussian_nll_loss( + input=n_mean, target=delta_targets_normalized, var=n_var + ).mean() + self.optimizers[index].zero_grad() + model_loss.backward() + self.optimizers[index].step() + self.curr_losses[index] = model_loss.item() + self.update_counter += 1 + self.update_counter %= self.boost_inter * self.num_models + + def estimate_uncertainty( + self, observation: torch.Tensor, actions: torch.Tensor, train_reward: bool + ) -> tuple[float, float, torch.Tensor]: + """ + Estimate uncertainty. 
+ + :param observation: + :param actions: + """ + next_state_samples = None + uncert_rwd = 0.0 + means = [] + vars_s = [] + normalized_state = normalize_observation(observation, self.statistics) + for model in self.world_models: + mean, var = model.forward(normalized_state, actions) + means.append(mean) + vars_s.append(var) + vars_s = torch.stack(vars_s).squeeze() + noises = vars_s.cpu().detach().numpy() + aleatoric = (noises**2).mean(axis=0) ** 0.5 + all_means = torch.stack(means).squeeze() + epistemic = all_means.cpu().detach().numpy() + epistemic = epistemic.var(axis=0) ** 0.5 + aleatoric = np.minimum(aleatoric, 10e3) + epistemic = np.minimum(epistemic, 10e3) + total_unc = (aleatoric**2 + epistemic**2) ** 0.5 + uncert = np.mean(total_unc) + if train_reward: + # Reward Uncertainty + sample_times = 20 + means = torch.vstack(means) + dist = torch.distributions.Normal(means, vars_s) + samples = dist.sample([sample_times]) + samples = torch.reshape( + samples, (sample_times * self.num_models, self.observation_size) + ) + samples = denormalize_observation_delta(samples, self.statistics) + observationss = torch.repeat_interleave( + observation, repeats=sample_times * self.num_models, dim=0 + ) + actionss = torch.repeat_interleave( + actions, repeats=sample_times * self.num_models, dim=0 + ) + samples += observationss + + if self.sas: + if self.prob_rwd: + rewards, rwd_var = self.reward_network( + observationss, actionss, samples + ) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 + else: + rewards = self.reward_network(observationss, actionss, samples) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + if self.prob_rwd: + rewards, rwd_var = self.reward_network(samples, actionss) + epis_uncert = torch.var(rewards, dim=0).item() + rwd_var = rwd_var.squeeze().detach().cpu().numpy().mean() + alea_uncert = rwd_var + epis_uncert = np.minimum(epis_uncert, 10e3) + alea_uncert = np.minimum(alea_uncert, 10e3) + uncert_rwd = ((epis_uncert**2) + (alea_uncert**2)) ** 0.5 + else: + rewards = self.reward_network(samples, actionss) + uncert_rwd = torch.var(rewards, dim=0).item() + else: + dist = torch.distributions.Normal(all_means, vars_s) + next_state_samples = dist.sample([20]) + next_state_samples = next_state_samples.reshape( + (self.num_models * 20, self.observation_size) + ) + next_state_samples = denormalize_observation_delta( + next_state_samples, self.statistics + ) + next_state_samples += observation + return uncert, uncert_rwd, next_state_samples + + def train_together( + self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor + ): + sample_times = 20 + normalized_state = normalize_observation(states, self.statistics) + mean_s = [] + var_s = [] + act_s = [] + state_s = [] + rwd_s = [] + for i in range(self.num_models): + mean, var = self.world_models[i].forward(normalized_state, actions) + mean_s.append(mean) + var_s.append(var) + act_s.append(actions) + state_s.append(states) + rwd_s.append(rewards) + + mean_s = torch.vstack(mean_s) + var_s = torch.vstack(var_s) + act_s = torch.vstack(act_s) + state_s = torch.vstack(state_s) + rwd_s = torch.vstack(rwd_s) + + dist = torch.distributions.Normal(mean_s, var_s) + samples = dist.sample([sample_times]) + + actions = torch.repeat_interleave( + act_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) 
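# Annotation (not part of the diff): as in the Ensemble_Dyna_Big variant, the
# sampled deltas are tiled against states/actions/rewards, flattened, denormalized
# and added to `states` before fitting the reward model. Note that the
# `not self.sas` branch below passes `actions` alongside the sampled next states,
# whereas the NS reward networks added in this patch (Simple_NS_Reward,
# Probabilistic_NS_Reward) take only the next observation; with the default
# sas=True that branch is not exercised.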
+ states = torch.repeat_interleave( + state_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + rwd_s = torch.repeat_interleave( + rwd_s.unsqueeze(dim=0), repeats=sample_times, dim=0 + ) + + samples = torch.reshape( + samples, (samples.shape[0] * samples.shape[1], self.observation_size) + ) + states = torch.reshape( + states, (states.shape[0] * states.shape[1], states.shape[2]) + ) + actions = torch.reshape( + actions, (actions.shape[0] * actions.shape[1], actions.shape[2]) + ) + rwd_s = torch.reshape(rwd_s, (rwd_s.shape[0] * rwd_s.shape[1], rwd_s.shape[2])) + + samples = denormalize_observation_delta(samples, self.statistics) + samples += states + + if self.prob_rwd: + if self.sas: + rwd_mean, rwd_var = self.reward_network(states, actions, samples) + else: + rwd_mean, rwd_var = self.reward_network(samples, actions) + rwd_loss = F.gaussian_nll_loss(rwd_mean, rwd_s, rwd_var) + else: + if self.sas: + rwd_mean = self.reward_network(states, actions, samples) + else: + rwd_mean = self.reward_network(samples, actions) + rwd_loss = F.mse_loss(rwd_mean, rwd_s) + self.reward_optimizer.zero_grad() + rwd_loss.backward() + self.reward_optimizer.step() diff --git a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py b/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py deleted file mode 100644 index 9b4c3f3e..00000000 --- a/cares_reinforcement_learning/networks/world_models/ensemble_integrated.py +++ /dev/null @@ -1,313 +0,0 @@ -import logging -import math -import random -import sys - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils -from torch import optim - -from cares_reinforcement_learning.networks.world_models import ( - SimpleDynamics, - SimpleReward, -) -import cares_reinforcement_learning.util.helpers as hlp - - -class IntegratedWorldModel: - """ - A integrated world model aims to train the reward prediciton and next state - prediciton together. - - :param (int) observation_size -- dimension of states - :param (int) num_actions -- dimension of actions - :param (int) hidden_size -- size of neurons in hidden layers. - """ - - def __init__( - self, - observation_size: int, - num_actions: int, - hidden_size: int, - lr: float = 0.001, - ): - self.dyna_network = SimpleDynamics( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - - self.reward_network = SimpleReward( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - ) - - self.reward_optimizer = optim.Adam(self.reward_network.parameters(), lr=lr) - - self.dyna_optimizer = optim.Adam(self.dyna_network.parameters(), lr=lr) - - self.all_optimizer = optim.Adam( - list(self.reward_network.parameters()) - + list(self.dyna_network.parameters()), - lr=lr, - ) - - self.statistics = {} - - def train_dynamics( - self, states: torch.Tensor, actions: torch.Tensor, next_states: torch.Tensor - ) -> None: - """ - Train the dynamics (next state prediciton) alone. Predicting the delta - rather than the next state. - - :param (Tensor) states -- states input - :param (Tensor) actions -- actions input - :param (Tensor) next_states -- target label. 
- """ - target = next_states - states - delta_targets_normalized = hlp.normalize_observation_delta( - target, self.statistics - ) - - _, n_mean, n_var = self.dyna_network.forward(states, actions) - - model_loss = F.gaussian_nll_loss( - input=n_mean, target=delta_targets_normalized, var=n_var - ).mean() - - self.dyna_optimizer.zero_grad() - model_loss.backward() - self.dyna_optimizer.step() - - def train_overall( - self, - states: torch.Tensor, - actions: torch.Tensor, - next_states: torch.Tensor, - next_actions: torch.Tensor, - next_rewards: torch.Tensor, - ) -> None: - """ - Do one step preidiciton, train both network together. Add Two loss - functions. - - :param (Tensor) states: - :param (Tensor) actions: - :param (Tensor) next_states: - :param (Tensor) next_actions: - :param (Tensor) next_rewards: - """ - # Get the dynamics training losses first - mean_deltas, normalized_mean, normalized_var = self.dyna_network.forward( - states, actions - ) - - # Always denormalized delta - pred_next_state = mean_deltas + states - target = next_states - states - - delta_targets_normalized = hlp.normalize_observation_delta( - target, self.statistics - ) - - model_loss = F.gaussian_nll_loss( - input=normalized_mean, target=delta_targets_normalized, var=normalized_var - ).mean() - - pred_rewards = self.reward_network.forward(pred_next_state, next_actions) - - all_loss = F.mse_loss(pred_rewards, next_rewards) + model_loss.mean() - - # Update - self.all_optimizer.zero_grad() - all_loss.backward() - self.all_optimizer.step() - - -class EnsembleWorldReward: - """ - Ensemble the integrated dynamic reward models. It works like a group of - experts. The predicted results can be used to estimate the uncertainty. - - :param (int) observation_size -- dimension of states - :param (int) num_actions -- dimension of actions - :param (int) num_models -- number of world models in this ensemble. - :param (int) hidden_size -- size of neurons in hidden layers. - """ - - def __init__( - self, - observation_size: int, - num_actions: int, - num_models: int, - lr: float, - device: torch.device, - hidden_size: int = 128, - ): - self.num_models = num_models - self.observation_size = observation_size - self.num_actions = num_actions - - self.models = [ - IntegratedWorldModel( - observation_size=observation_size, - num_actions=num_actions, - hidden_size=hidden_size, - lr=lr, - ) - for _ in range(self.num_models) - ] - self.statistics = {} - - # Bring all reward prediction and dynamic rediction networks to device. - self.device = device - for model in self.models: - model.dyna_network.to(device) - model.reward_network.to(device) - - def set_statistics(self, statistics: dict) -> None: - """ - Update all statistics for normalization for all world models and the - ensemble itself. - - :param (Dictionary) statistics: - """ - for key, value in statistics.items(): - if isinstance(value, np.ndarray): - statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) - - self.statistics = statistics - for model in self.models: - model.statistics = statistics - model.dyna_network.statistics = statistics - - def pred_rewards( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> torch.Tensor: - """ - Make a prediciton of rewards based on current state and actions. Take - the mean of rewards as final for now. - - :param (Tensors) obs -- dimension of states - :param (Tensors) actions -- dimension of actions - - :return (Tensors) reward -- predicted mean rewards. - :return (List) rewards -- A list of predicted rewards. 
For STEVE use. - """ - rewards = [] - for model in self.models: - pred_rewards = model.reward_network.forward(observation, actions) - rewards.append(pred_rewards) - - # Use average - rewards = torch.stack(rewards) - reward = torch.min(rewards, dim=0).values # Pessimetic - - return reward, rewards - - def pred_next_states( - self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Predict the next state based on current state and action, using an - ensemble of world models. The world model is probablisitic. It is - trained with Gaussian NLL loss. - - :param (Tensors) obs -- dimension of states - :param (Tensors) actions -- dimension of actions - - :return (Tensors) random picked next state predicitons - :return (Tensors) all next state predicitons - :return (Tensors) all normalized delta' means - :return (Tensors) all normalized delta' vars - """ - means = [] - norm_means = [] - norm_vars = [] - - # Iterate over the neural networks and get the predictions - for model in self.models: - # Predict delta - mean, n_mean, n_var = model.dyna_network.forward(observation, actions) - means.append(mean) - norm_means.append(n_mean) - norm_vars.append(n_var) - - # Normalized - predictions_means = torch.stack(means) - predictions_norm_means = torch.stack(norm_means) - predictions_vars = torch.stack(norm_vars) - - # Get rid of the nans - not_nans = [] - for i in range(self.num_models): - if not torch.any(torch.isnan(predictions_means[i])): - not_nans.append(i) - if len(not_nans) == 0: - logging.info("Predicting all Nans") - sys.exit() - - rand_ind = random.randint(0, len(not_nans) - 1) - prediction = predictions_means[not_nans[rand_ind]] - - # next = current + delta - prediction += observation - all_predictions = torch.stack(means) - for j in range(all_predictions.shape[0]): - all_predictions[j] += observation - - return prediction, all_predictions, predictions_norm_means, predictions_vars - - def train_world( - self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - next_actions: torch.Tensor, - next_rewards: torch.Tensor, - ) -> None: - # pylint: disable-next=unused-argument - """ - This function decides how to train both reward prediciton and dynamic - prediction. - - :param (Tensors) input states: - :param (Tensors) input actions: - :param (Tensors) input rewards: - :param (Tensors) input next_states: - :param (Tensors) input next_actions: - :param (Tensors) input next_rewards: - """ - # For each model, train with different data. 
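# Annotation (not part of the diff): the removed trainer below gave each ensemble
# member a disjoint slice of the batch, i.e. model i trained on
# states[i * mb : (i + 1) * mb] with mb = floor(batch / num_models). The
# replacement ensembles in this patch instead rotate full batches across members
# via boost_inter (see Ensemble_Dyna_Big.train_world above).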
- mini_batch_size = int(math.floor(states.shape[0] / self.num_models)) - - for i in range(self.num_models): - states_i = states[i * mini_batch_size : (i + 1) * mini_batch_size] - actions_i = actions[i * mini_batch_size : (i + 1) * mini_batch_size] - next_states_i = next_states[i * mini_batch_size : (i + 1) * mini_batch_size] - - self.models[i].train_dynamics( - states_i, - actions_i, - next_states_i, - ) - - next_actions_i = next_actions[ - i * mini_batch_size : (i + 1) * mini_batch_size - ] - next_rewards_i = next_rewards[ - i * mini_batch_size : (i + 1) * mini_batch_size - ] - - self.models[i].train_overall( - states_i, - actions_i, - next_states_i, - next_actions_i, - next_rewards_i, - ) diff --git a/cares_reinforcement_learning/networks/world_models/simple/__init__.py b/cares_reinforcement_learning/networks/world_models/simple/__init__.py new file mode 100644 index 00000000..96f070c5 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple/__init__.py @@ -0,0 +1,5 @@ +from .simple_ns_reward import Simple_NS_Reward +from .simple_sas_reward import Simple_SAS_Reward +from .probabilistic_ns_reward import Probabilistic_NS_Reward +from .probabilistic_sas_reward import Probabilistic_SAS_Reward +from .probabilistic_dynamic import Probabilistic_Dynamics diff --git a/cares_reinforcement_learning/networks/world_models/simple_dynamics.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py similarity index 61% rename from cares_reinforcement_learning/networks/world_models/simple_dynamics.py rename to cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py index 3464c876..fb9b7477 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_dynamics.py +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_dynamic.py @@ -1,12 +1,11 @@ import torch -import torch.nn.functional as F import torch.utils from torch import nn -import cares_reinforcement_learning.util.helpers as hlp +from cares_reinforcement_learning.util import weight_init_pnn, MLP -class SimpleDynamics(nn.Module): +class Probabilistic_Dynamics(nn.Module): """ A world model with fully connected layers. It takes current states (s) and current actions (a), and predict next states (s'). @@ -22,23 +21,27 @@ class SimpleDynamics(nn.Module): :param (int) hidden_size -- size of neurons in hidden layers. """ - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): + def __init__(self, observation_size: int, num_actions: int, hidden_size: list): + print("Create a Prob Dynamics") super().__init__() self.observation_size = observation_size self.num_actions = num_actions - self.layer1 = nn.Linear(observation_size + num_actions, hidden_size) - self.layer2 = nn.Linear(hidden_size, hidden_size) - self.mean_layer = nn.Linear(hidden_size, observation_size) - self.logvar_layer = nn.Linear(hidden_size, observation_size) + self.model = MLP( + input_size=observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=2 * observation_size, + ) + + self.add_module("mlp", self.model) - self.apply(hlp.weight_init) + self.model.apply(weight_init_pnn) self.statistics = {} def forward( self, observation: torch.Tensor, actions: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward the inputs throught the network. @@ -52,23 +55,16 @@ def forward( :return (Tensors) normalized_var -- normalized delta of var for uncertainty estimation. 
""" - + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) # Always normalized obs - normalized_obs = hlp.normalize_observation(observation, self.statistics) - - x = torch.cat((normalized_obs, actions), dim=1) - x = self.layer1(x) - x = F.relu(x) - x = self.layer2(x) - x = F.relu(x) - - normalized_mean = self.mean_layer(x) - logvar = self.logvar_layer(x) + x = torch.cat((observation, actions), dim=1) + pred = self.model(x) + logvar = pred[:, : self.observation_size] + normalized_mean = pred[:, self.observation_size :] logvar = torch.tanh(logvar) normalized_var = torch.exp(logvar) - # Always denormalized delta - mean_deltas = hlp.denormalize_observation_delta( - normalized_mean, self.statistics - ) - return mean_deltas, normalized_mean, normalized_var + return normalized_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py new file mode 100644 index 00000000..010bfbef --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_ns_reward.py @@ -0,0 +1,43 @@ +import torch +from torch import nn, Tensor +import torch.nn.functional as F +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init + + +class Probabilistic_NS_Reward(nn.Module): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). + """ + super().__init__() + print("Create a Prob NS Rewrad") + self.normalize = normalize + self.observation_size = observation_size + self.num_actions = num_actions + self.model = MLP( + input_size=observation_size, hidden_sizes=hidden_size, output_size=2 + ) + self.add_module("mlp", self.model) + self.model.apply(weight_init) + + def forward(self, next_observation: torch.Tensor) -> tuple[Tensor, Tensor]: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. + """ + pred = self.model(next_observation) + var_mean = pred[:, 1].unsqueeze(dim=1) + rwd_mean = pred[:, 0].unsqueeze(dim=1) + logvar = torch.tanh(var_mean) + normalized_var = torch.exp(logvar) + if self.normalize: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py new file mode 100644 index 00000000..122be07c --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple/probabilistic_sas_reward.py @@ -0,0 +1,63 @@ +import torch +from torch import nn, Tensor +import torch.nn.functional as F +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init + + +class Probabilistic_SAS_Reward(nn.Module): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). 
+ + :param (int) observation_size -- dimension of states + :param (int) num_actions -- dimension of actions + :param (int) hidden_size -- size of neurons in hidden layers. + """ + super().__init__() + print("Create a Prob SAS Rewrad") + self.normalize = normalize + self.observation_size = observation_size + self.num_actions = num_actions + + self.model = MLP( + input_size=2 * observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=2, + ) + + self.add_module("mlp", self.model) + self.model.apply(weight_init) + + def forward( + self, + observation: torch.Tensor, + actions: torch.Tensor, + next_observation: torch.Tensor, + ) -> tuple[Tensor, Tensor]: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. + + :param (Tensors) obs -- dimension of states + :param (Tensors) actions -- dimension of actions + :param (Bool) normalized -- whether normalized reward to 0~1 + + :return (Tensors) x -- predicted rewards. + """ + x = torch.cat((observation, actions, next_observation), dim=1) + pred = self.model(x) + rwd_mean = pred[:, 0].unsqueeze(dim=1) + var_mean = pred[:, 1].unsqueeze(dim=1) + logvar = torch.tanh(var_mean) + normalized_var = torch.exp(logvar) + if self.normalize: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean, normalized_var diff --git a/cares_reinforcement_learning/networks/world_models/simple_rewards.py b/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py similarity index 55% rename from cares_reinforcement_learning/networks/world_models/simple_rewards.py rename to cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py index cee44a93..52a30b7a 100644 --- a/cares_reinforcement_learning/networks/world_models/simple_rewards.py +++ b/cares_reinforcement_learning/networks/world_models/simple/simple_ns_reward.py @@ -1,12 +1,17 @@ import torch -import torch.nn.functional as F from torch import nn - -import cares_reinforcement_learning.util.helpers as hlp +import torch.nn.functional as F +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init -class SimpleReward(nn.Module): - def __init__(self, observation_size: int, num_actions: int, hidden_size: int): +class Simple_NS_Reward(nn.Module): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): """ Note, This reward function is limited to 0 ~ 1 for dm_control. A reward model with fully connected layers. It takes current states (s) @@ -17,17 +22,17 @@ def __init__(self, observation_size: int, num_actions: int, hidden_size: int): :param (int) hidden_size -- size of neurons in hidden layers. """ super().__init__() + print("Create a Simple NS Rewrad") + self.normalize = normalize self.observation_size = observation_size self.num_actions = num_actions - self.linear1 = nn.Linear(observation_size + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) + self.model = MLP( + input_size=observation_size, hidden_sizes=hidden_size, output_size=1 + ) + self.add_module("mlp", self.model) + self.model.apply(weight_init) - self.apply(hlp.weight_init) - - def forward( - self, observation: torch.Tensor, actions: torch.Tensor, normalized: bool = False - ) -> torch.Tensor: + def forward(self, observation: torch.Tensor) -> torch.Tensor: """ Forward the inputs throught the network. Note: For DMCS environment, the reward is from 0~1. 
@@ -38,14 +43,7 @@ def forward( :return (Tensors) x -- predicted rewards. """ - x = torch.cat((observation, actions), dim=1) - x = self.linear1(x) - x = F.relu(x) - x = self.linear2(x) - x = F.relu(x) - x = self.linear3(x) - - if normalized: - x = F.sigmoid(x) - - return x + rwd_mean = self.model(observation) + if self.normalize: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py b/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py new file mode 100644 index 00000000..2df6e8b3 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/simple/simple_sas_reward.py @@ -0,0 +1,61 @@ +import torch +from torch import nn +import torch.nn.functional as F +from cares_reinforcement_learning.util import weight_init_pnn, MLP, weight_init + + +class Simple_SAS_Reward(nn.Module): + def __init__( + self, + observation_size: int, + num_actions: int, + hidden_size: list, + normalize: bool, + ): + """ + Note, This reward function is limited to 0 ~ 1 for dm_control. + A reward model with fully connected layers. It takes current states (s) + and current actions (a), and predict rewards (r). + + :param (int) observation_size -- dimension of states + :param (int) num_actions -- dimension of actions + :param (int) hidden_size -- size of neurons in hidden layers. + """ + super().__init__() + print("Create a Simple SAS Rewrad") + self.normalize = normalize + self.observation_size = observation_size + self.num_actions = num_actions + self.model = MLP( + input_size=2 * observation_size + num_actions, + hidden_sizes=hidden_size, + output_size=1, + ) + self.add_module("mlp", self.model) + self.model.apply(weight_init) + + def forward( + self, + observation: torch.Tensor, + actions: torch.Tensor, + next_observation: torch.Tensor, + ) -> torch.Tensor: + """ + Forward the inputs throught the network. + Note: For DMCS environment, the reward is from 0~1. + + :param (Tensors) obs -- dimension of states + :param (Tensors) actions -- dimension of actions + :param (Bool) normalized -- whether normalized reward to 0~1 + + :return (Tensors) x -- predicted rewards. 
+ """ + assert ( + observation.shape[1] + actions.shape[1] + == self.observation_size + self.num_actions + ) + x = torch.cat((observation, actions, next_observation), dim=1) + rwd_mean = self.model(x) + if self.normalize: + rwd_mean = F.sigmoid(rwd_mean) + return rwd_mean diff --git a/cares_reinforcement_learning/networks/world_models/world_model.py b/cares_reinforcement_learning/networks/world_models/world_model.py new file mode 100644 index 00000000..6077f198 --- /dev/null +++ b/cares_reinforcement_learning/networks/world_models/world_model.py @@ -0,0 +1,279 @@ +import logging +import torch +import numpy as np +from cares_reinforcement_learning.networks.world_models.simple import ( + Probabilistic_SAS_Reward, + Probabilistic_NS_Reward, +) +from cares_reinforcement_learning.networks.world_models.simple import ( + Simple_SAS_Reward, + Simple_NS_Reward, +) +import torch.nn.functional as F +import torch.utils +from torch import optim + + +class World_Model: + """ + World Model + """ + + def __init__( + self, + observation_size: int, + num_actions: int, + l_r: float, + device: str, + hidden_size=None, + sas: bool = True, + prob_rwd: bool = False, + num_rwd_model: int = 5, + ): + logging.info(f"Num of Reward models: {num_rwd_model}") + if hidden_size is None: + hidden_size = [128, 128] + self.sas = None + self.prob_rwd = None + self.statistics = {} + self.device = device + self.sas = sas + self.prob_rwd = prob_rwd + self.statistics = {} + self.counter = 0 + self.num_rwd_model = num_rwd_model + + self.rwd_models = [] + self.rwd_model_optimizers = [] + for i in range(self.num_rwd_model): + if prob_rwd: + if sas: + reward_network = Probabilistic_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False, + ) + else: + reward_network = Probabilistic_NS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False, + ) + else: + if sas: + reward_network = Simple_SAS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False, + ) + else: + reward_network = Simple_NS_Reward( + observation_size=observation_size, + num_actions=num_actions, + hidden_size=hidden_size, + normalize=False, + ) + reward_network.to(self.device) + self.rwd_models.append(reward_network) + reward_optimizer = optim.Adam(reward_network.parameters(), lr=l_r) + self.rwd_model_optimizers.append(reward_optimizer) + + def set_statistics(self, statistics: dict) -> None: + """ + Update all statistics for normalization for all world models and the + ensemble itself. + + :param (Dictionary) statistics: + """ + for key, value in statistics.items(): + if isinstance(value, np.ndarray): + statistics[key] = torch.FloatTensor(statistics[key]).to(self.device) + self.statistics = statistics + + def train_world( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + ) -> None: + """ + Train the dynamic of world model. + :param states: + :param actions: + :param next_states: + """ + logging.info(" Train world Not Implemented") + + def pred_next_states( + self, observation: torch.Tensor, actions: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Make a prediction of next state. + :param observation: + :param actions: + :return: Next_state Prediction, Next_state Means, Next_State Variance. 
+ """ + logging.info("Predict Next Not Implemented") + return ( + torch.zeros(observation.shape), + torch.zeros(observation.shape), + torch.zeros(observation.shape), + ) + + def train_reward( + self, + states: torch.Tensor, + actions: torch.Tensor, + next_states: torch.Tensor, + rewards: torch.Tensor, + ) -> None: + """ + Train the reward prediction with or without world model dynamics. + + :param states: + :param actions: + :param next_states: + :param rewards: + """ + indice = self.counter % self.num_rwd_model + self.rwd_model_optimizers[indice].zero_grad() + if self.prob_rwd: + if self.sas: + rwd_mean, rwd_var = self.rwd_models[indice]( + states, actions, next_states + ) + else: + rwd_mean, rwd_var = self.rwd_models[indice](next_states) + reward_loss = F.gaussian_nll_loss( + input=rwd_mean, target=rewards, var=rwd_var + ) + else: + if self.sas: + rwd_mean = self.rwd_models[indice](states, actions, next_states) + else: + rwd_mean = self.rwd_models[indice](next_states) + reward_loss = F.mse_loss(rwd_mean, rewards) + reward_loss.backward() + self.rwd_model_optimizers[indice].step() + self.counter += 1 + + def pred_rewards( + self, + observation: torch.Tensor, + action: torch.Tensor, + next_observation: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict reward based on SAS + :param observation: + :param action: + :param next_observation: + :return: Predicted rewards, Means of rewards, Variances of rewards + """ + preds = [] + preds_vars = [] + for i in range(self.num_rwd_model): + if self.prob_rwd: + if self.sas: + pred_rewards, rwd_var = self.rwd_models[i]( + observation, action, next_observation + ) + else: + pred_rewards, rwd_var = self.rwd_models[i](next_observation) + else: + if self.sas: + pred_rewards = self.rwd_models[i]( + observation, action, next_observation + ) + else: + pred_rewards = self.rwd_models[i](next_observation) + rwd_var = None + preds.append(pred_rewards) + preds_vars.append(rwd_var) + preds = torch.stack(preds) + total_unc = 0.0 + if self.num_rwd_model > 1: + epistemic_uncert = torch.var(preds, dim=0) ** 0.5 + aleatoric_uncert = torch.zeros(epistemic_uncert.shape) + if rwd_var is None: + rwd_var = torch.zeros(preds.shape) + else: + rwd_var = torch.stack(preds_vars) + aleatoric_uncert = torch.mean(rwd_var**2, dim=0) ** 0.5 + total_unc = (aleatoric_uncert**2 + epistemic_uncert**2) ** 0.5 + + if preds.shape[0] > 1: + preds = torch.mean(preds, dim=0) + else: + preds = preds[0] + + return preds, total_unc + + def pred_all_rewards( + self, + observation: torch.Tensor, + action: torch.Tensor, + next_observation: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict reward based on SAS + :param observation: + :param action: + :param next_observation: + :return: Predicted rewards, Means of rewards, Variances of rewards + """ + preds = [] + preds_vars = [] + for j in range(next_observation.shape[0]): + for i in range(self.num_rwd_model): + if self.prob_rwd: + if self.sas: + pred_rewards, rwd_var = self.rwd_models[i]( + observation, action, next_observation[j] + ) + else: + pred_rewards, rwd_var = self.rwd_models[i](next_observation[j]) + else: + if self.sas: + pred_rewards = self.rwd_models[i]( + observation, action, next_observation[j] + ) + else: + pred_rewards = self.rwd_models[i](next_observation[j]) + rwd_var = None + preds.append(pred_rewards) + preds_vars.append(rwd_var) + preds = torch.stack(preds) + if rwd_var is None: + preds_vars = torch.zeros(preds.shape) + else: + preds_vars = torch.stack(preds_vars) + + return preds, 
preds_vars + + def estimate_uncertainty( + self, + observation: torch.Tensor, + actions: torch.Tensor, + train_reward: bool, + ) -> tuple[float, float, torch.Tensor]: + """ + Estimate next state uncertainty and reward uncertainty. + + :param observation: + :param actions: + :return: Dynamic Uncertainty, Reward Uncertainty + """ + logging.info("Estimating Uncertainty Not Implemented") + return 0.0, 0.0, None + + def train_together( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + ): + logging.info("Train Together Not Implemented") diff --git a/cares_reinforcement_learning/util/__init__.py b/cares_reinforcement_learning/util/__init__.py index 2b4965bb..6a13b055 100644 --- a/cares_reinforcement_learning/util/__init__.py +++ b/cares_reinforcement_learning/util/__init__.py @@ -1,3 +1,5 @@ from cares_reinforcement_learning.util.network_factory import NetworkFactory from cares_reinforcement_learning.util.record import Record from cares_reinforcement_learning.util.rl_parser import RLParser +from cares_reinforcement_learning.util.helpers import * +from cares_reinforcement_learning.util.uncertainty_estimation import * diff --git a/cares_reinforcement_learning/util/configurations.py b/cares_reinforcement_learning/util/configurations.py index 8a660ebd..a1a47c4b 100644 --- a/cares_reinforcement_learning/util/configurations.py +++ b/cares_reinforcement_learning/util/configurations.py @@ -3,12 +3,12 @@ import pydantic from pydantic import BaseModel, Field from torch import nn - from cares_reinforcement_learning.encoders.configurations import ( BurgessConfig, VanillaAEConfig, ) + # pylint disbale-next=unused-import # NOTE: If a parameter is a list then don't wrap with Optional leave as implicit optional - list[type] = default @@ -104,7 +104,7 @@ class AlgorithmConfig(SubscriptableClass): algorithm: str = Field(description="Name of the algorithm to be used") G: int = 1 - G_model: int = 1 + G_model: float = 1 buffer_size: int = 1000000 batch_size: int = 256 max_steps_exploration: int = 1000 @@ -118,11 +118,379 @@ class AlgorithmConfig(SubscriptableClass): image_observation: int = 0 +class PPOConfig(AlgorithmConfig): + algorithm: str = Field("PPO", Literal=True) + actor_lr: float = 1e-4 + critic_lr: float = 1e-3 + + gamma: float = 0.99 + eps_clip: float = 0.2 + updates_per_iteration: int = 10 + + max_steps_per_batch: int = 5000 + + actor_config: MLPConfig = MLPConfig( + hidden_sizes=[1024, 1024], output_activation_function=nn.Tanh.__name__ + ) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) + + ################################### -# DQN Algorithms # +# SAC Algorithms # ################################### +class SACConfig(AlgorithmConfig): + algorithm: str = Field("SAC", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + +class DynaSAC_NSConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_NS", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + 
target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + + +class STEVESACConfig(AlgorithmConfig): + algorithm: str = Field("STEVESAC", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + + num_models: int = 6 + num_rwd_models: int = 5 + world_model_lr: float = 0.001 + + horizon: int = 3 + + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + + +class STEVESAC_BoundedConfig(AlgorithmConfig): + algorithm: str = Field("STEVESAC_Bounded", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + + num_models: int = 6 + num_rwd_models: int = 5 + world_model_lr: float = 0.001 + + horizon: int = 3 + + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + + threshold: float = 0.1 + exploration_sample: int = 5 + + +class DynaSAC_BoundedConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Bounded", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + num_rwd_models: int = 1 + max_steps_exploration: int = 256 + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + threshold: float = 0.1 + exploration_sample: int = 5 + + +class STEVESAC_Bounded_YaoConfig(AlgorithmConfig): + algorithm: str = Field("STEVESAC_Bounded_Yao", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + + 
num_models: int = 6 + num_rwd_models: int = 5 + world_model_lr: float = 0.001 + + horizon: int = 3 + + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + + threshold: float = 0.1 + exploration_sample: int = 5 + + +class DynaSAC_Bounded_YaoConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_Bounded_Yao", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + num_rwd_models: int = 1 + max_steps_exploration: int = 256 + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = False + gripper: bool = False + threshold: float = 0.1 + exploration_sample: int = 5 + + +class STEVE_MEANConfig(AlgorithmConfig): + algorithm: str = Field("STEVE", Literal=True) + type: str = Field("mbrl", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + sas: bool = False + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + +class DynaSAC_NS_IWConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_NS_IW", Literal=True) + type: str = Field("mbrl", Literal=True) + G: int = (1,) + G_model: float = (1,) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + max_steps_exploration: int = 256 + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + + num_rwd_models: int = 1 + sas: bool = False + threshold: float = 0.1 + reweight_actor: bool = False + + train_reward: bool = True + train_both: bool = False + gripper: bool = False + + +class DynaSAC_BIVReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_BIVNS", Literal=True) + type: str = Field("mbrl", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + + threshold: float = 0.1 + reweight_actor: bool = False + + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + +class 
DynaSAC_SUNRISEReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_SUNRISENS", Literal=True) + type: str = Field("mbrl", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + + threshold: float = 0.1 + reweight_actor: bool = False + + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + +class DynaSAC_UWACReweightConfig(AlgorithmConfig): + algorithm: str = Field("DynaSAC_UWACNS", Literal=True) + type: str = Field("mbrl", Literal=True) + actor_lr: float = 3e-4 + critic_lr: float = 3e-4 + alpha_lr: float = 3e-4 + gamma: float = 0.99 + tau: float = 0.005 + reward_scale: float = 1.0 + log_std_bounds: list[float] = [-20, 2] + policy_update_freq: int = 1 + target_update_freq: int = 1 + actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) + num_models: int = 5 + world_model_lr: float = 0.001 + horizon: int = 3 + num_samples: int = 10 + + threshold: float = 0.1 + reweight_actor: bool = False + + train_reward: bool = True + train_both: bool = True + gripper: bool = False + + +############ Useless to me ########### class DQNConfig(AlgorithmConfig): algorithm: str = Field("DQN", Literal=True) lr: float = 1e-3 @@ -159,33 +527,6 @@ class DuelingDQNConfig(AlgorithmConfig): advantage_stream_config: MLPConfig = MLPConfig(hidden_sizes=[512]) -################################### -# PPO Algorithms # -################################### - - -class PPOConfig(AlgorithmConfig): - algorithm: str = Field("PPO", Literal=True) - actor_lr: float = 1e-4 - critic_lr: float = 1e-3 - - gamma: float = 0.99 - eps_clip: float = 0.2 - updates_per_iteration: int = 10 - - max_steps_per_batch: int = 5000 - - actor_config: MLPConfig = MLPConfig( - hidden_sizes=[1024, 1024], output_activation_function=nn.Tanh.__name__ - ) - critic_config: MLPConfig = MLPConfig(hidden_sizes=[1024, 1024]) - - -################################### -# SAC Algorithms # -################################### - - class SACDConfig(AlgorithmConfig): algorithm: str = Field("SACD", Literal=True) actor_lr: float = 3e-4 @@ -210,25 +551,6 @@ class SACDConfig(AlgorithmConfig): critic_config: MLPConfig = MLPConfig(hidden_sizes=[512, 512]) -class SACConfig(AlgorithmConfig): - algorithm: str = Field("SAC", Literal=True) - actor_lr: float = 3e-4 - critic_lr: float = 3e-4 - alpha_lr: float = 3e-4 - - gamma: float = 0.99 - tau: float = 0.005 - reward_scale: float = 1.0 - - log_std_bounds: list[float] = [-20, 2] - - policy_update_freq: int = 1 - target_update_freq: int = 1 - - actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) - critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) - - class SACAEConfig(SACConfig): algorithm: str = Field("SACAE", Literal=True) @@ -482,38 +804,6 @@ class DroQConfig(SACConfig): ) -class DynaSACConfig(SACConfig): - algorithm: str = Field("DynaSAC", Literal=True) - actor_lr: float = 3e-4 - critic_lr: float = 3e-4 - - alpha_lr: float = 3e-4 - - # TODO this bool doesn't work as expected - needs to be int 1/0 - use_bounded_active: bool = False - num_models: int = 5 - - gamma: float = 
0.99 - tau: float = 0.005 - - log_std_bounds: list[float] = [-20, 2] - - policy_update_freq: int = 1 - target_update_freq: int = 1 - - actor_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) - critic_config: MLPConfig = MLPConfig(hidden_sizes=[256, 256]) - - horizon: int = 3 - num_samples: int = 10 - world_model_lr: float = 0.001 - - -################################### -# TD3 Algorithms # -################################### - - class DDPGConfig(AlgorithmConfig): algorithm: str = Field("DDPG", Literal=True) actor_lr: float = 1e-4 diff --git a/cares_reinforcement_learning/util/helpers.py b/cares_reinforcement_learning/util/helpers.py index 7980075e..c49f2917 100644 --- a/cares_reinforcement_learning/util/helpers.py +++ b/cares_reinforcement_learning/util/helpers.py @@ -4,6 +4,51 @@ import numpy as np import torch +from torch import nn as nn +import torch.nn.functional as F +import time + + +class MLP(nn.Module): + def __init__(self, input_size: int, hidden_sizes: list[int], output_size: int): + super().__init__() + + self.fully_connected_layers = [] + for i, next_size in enumerate(hidden_sizes): + fully_connected_layer = nn.Linear(input_size, next_size) + self.add_module(f"fully_connected_layer_{i}", fully_connected_layer) + self.fully_connected_layers.append(fully_connected_layer) + input_size = next_size + + self.output_layer = nn.Linear(input_size, output_size) + + def forward(self, state): + for fully_connected_layer in self.fully_connected_layers: + state = F.relu(fully_connected_layer(state)) + output = self.output_layer(state) + return output + + +def weight_init_pnn(module: torch.nn.Module) -> None: + """ + Custom weight init for Conv2D and Linear layers + + delta-orthogonal init from https://arxiv.org/pdf/1806.05393.pdf + """ + if isinstance(module, torch.nn.Linear): + torch.manual_seed(int(time.time())) + torch.cuda.manual_seed_all(int(time.time())) + torch.nn.init.xavier_uniform_(module.weight) + module.bias.data.uniform_(-0.5, 0.5) + + elif isinstance(module, (torch.nn.Conv2d, torch.nn.ConvTranspose2d)): + assert module.weight.size(2) == module.weight.size(3) + module.weight.data.fill_(0.0) + module.bias.data.fill_(0.0) + mid = module.weight.size(2) // 2 + gain = torch.nn.init.calculate_gain("relu") + torch.nn.init.orthogonal_(module.weight.data[:, :, mid, mid], gain) + def get_device() -> torch.device: device = torch.device("cpu") diff --git a/cares_reinforcement_learning/util/network_factory.py b/cares_reinforcement_learning/util/network_factory.py index 616ca976..61738453 100644 --- a/cares_reinforcement_learning/util/network_factory.py +++ b/cares_reinforcement_learning/util/network_factory.py @@ -1,9 +1,3 @@ -""" -This module provides functions to create different types of reinforcement learning agents -with their corresponding network architectures. 
-""" - -import copy import inspect import logging import sys @@ -11,99 +5,15 @@ import cares_reinforcement_learning.util.configurations as acf import cares_reinforcement_learning.util.helpers as hlp + # Disable these as this is a deliberate use of dynamic imports # pylint: disable=import-outside-toplevel # pylint: disable=invalid-name - ################################### # DQN Algorithms # ################################### -def create_DQN(observation_size, action_num, config: acf.DQNConfig): - from cares_reinforcement_learning.algorithm.value import DQN - from cares_reinforcement_learning.networks.DQN import Network - - network = Network(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = DQN(network=network, config=config, device=device) - return agent - - -def create_DuelingDQN(observation_size, action_num, config: acf.DuelingDQNConfig): - """ - Original paper https://arxiv.org/abs/1511.06581 - """ - from cares_reinforcement_learning.algorithm.value import DQN - from cares_reinforcement_learning.networks.DuelingDQN import Network - - network = Network(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = DQN(network=network, config=config, device=device) - return agent - - -def create_DoubleDQN(observation_size, action_num, config: acf.DoubleDQNConfig): - from cares_reinforcement_learning.algorithm.value import DoubleDQN - from cares_reinforcement_learning.networks.DoubleDQN import Network - - network = Network(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = DoubleDQN( - network=network, - config=config, - device=device, - ) - return agent - - -################################### -# PPO Algorithms # -################################### - - -def create_PPO(observation_size, action_num, config: acf.PPOConfig): - from cares_reinforcement_learning.algorithm.policy import PPO - from cares_reinforcement_learning.networks.PPO import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, config=config) - - device = hlp.get_device() - agent = PPO( - actor_network=actor, - critic_network=critic, - config=config, - device=device, - ) - return agent - - -################################### -# SAC Algorithms # -################################### - - -def create_SACD(observation_size, action_num, config: acf.SACDConfig): - from cares_reinforcement_learning.algorithm.policy import SACD - from cares_reinforcement_learning.networks.SACD import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = SACD( - actor_network=actor, - critic_network=critic, - config=config, - device=device, - ) - return agent - - def create_SAC(observation_size, action_num, config: acf.SACConfig): from cares_reinforcement_learning.algorithm.policy import SAC from cares_reinforcement_learning.networks.SAC import Actor, Critic @@ -121,427 +31,526 @@ def create_SAC(observation_size, action_num, config: acf.SACConfig): return agent -def create_SACAE(observation_size, action_num, config: acf.SACAEConfig): - from cares_reinforcement_learning.algorithm.policy import SACAE - from cares_reinforcement_learning.encoders.vanilla_autoencoder import Decoder - from cares_reinforcement_learning.networks.SACAE import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - 
ae_config = config.autoencoder_config - decoder = Decoder( - observation_size["image"], - out_dim=actor.encoder.out_dim, - latent_dim=ae_config.latent_dim, - num_layers=ae_config.num_layers, - num_filters=ae_config.num_filters, - kernel_size=ae_config.kernel_size, - ) - - device = hlp.get_device() - agent = SACAE( - actor_network=actor, - critic_network=critic, - decoder_network=decoder, - config=config, - device=device, +def create_DynaSAC_NS(observation_size, action_num, config: acf.DynaSAC_NSConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) - return agent - - -def create_PERSAC(observation_size, action_num, config: acf.PERSACConfig): - from cares_reinforcement_learning.algorithm.policy import PERSAC - from cares_reinforcement_learning.networks.PERSAC import Actor, Critic actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - agent = PERSAC( - actor_network=actor, - critic_network=critic, - config=config, - device=device, - ) - return agent - -def create_REDQ(observation_size, action_num, config: acf.REDQConfig): - from cares_reinforcement_learning.algorithm.policy import REDQ - from cares_reinforcement_learning.networks.REDQ import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - ensemble_critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = REDQ( - actor_network=actor, - ensemble_critic=ensemble_critic, - config=config, + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, + boost_inter=30, ) - return agent - - -def create_TQC(observation_size, action_num, config: acf.TQCConfig): - from cares_reinforcement_learning.algorithm.policy import TQC - from cares_reinforcement_learning.networks.TQC import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - device = hlp.get_device() - agent = TQC( + agent = DynaSAC_NS( actor_network=actor, critic_network=critic, - config=config, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, ) return agent -def create_LAPSAC(observation_size, action_num, config: acf.LAPSACConfig): - from cares_reinforcement_learning.algorithm.policy import LAPSAC - from cares_reinforcement_learning.networks.LAPSAC import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = LAPSAC( - actor_network=actor, - critic_network=critic, - config=config, - device=device, +def create_DynaSAC_Bounded( + observation_size, action_num, config: acf.DynaSAC_BoundedConfig +): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. 
+ An extra world model is added. + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Bounded + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) - return agent - - -def create_LA3PSAC(observation_size, action_num, config: acf.LA3PSACConfig): - from cares_reinforcement_learning.algorithm.policy import LA3PSAC - from cares_reinforcement_learning.networks.LA3PSAC import Actor, Critic actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - agent = LA3PSAC( - actor_network=actor, - critic_network=critic, - config=config, - device=device, - ) - return agent - - -def create_MAPERSAC(observation_size, action_num, config: acf.MAPERSACConfig): - from cares_reinforcement_learning.algorithm.policy import MAPERSAC - from cares_reinforcement_learning.networks.MAPERSAC import Actor, Critic - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = MAPERSAC( - actor_network=actor, - critic_network=critic, - config=config, + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, + prob_rwd=True, + boost_inter=30, ) - return agent - - -def create_RDSAC(observation_size, action_num, config: acf.RDSACConfig): - from cares_reinforcement_learning.algorithm.policy import RDSAC - from cares_reinforcement_learning.networks.RDSAC import Actor, Critic - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = RDSAC( + agent = DynaSAC_Bounded( actor_network=actor, critic_network=critic, - config=config, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample, ) + return agent -def create_DroQ(observation_size, action_num, config: acf.DroQConfig): - from cares_reinforcement_learning.algorithm.policy import DroQ - from cares_reinforcement_learning.networks.DroQ import Actor, Critic +def create_DynaSAC_Bounded_Yao( + observation_size, action_num, config: acf.DynaSAC_Bounded_YaoConfig +): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_Bounded_Yao + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - agent = DroQ( - actor_network=actor, - critic_network=critic, - config=config, + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, + prob_rwd=True, + boost_inter=30, ) - return agent - -def create_CrossQ(observation_size, action_num, config: acf.CrossQConfig): - from cares_reinforcement_learning.algorithm.policy import CrossQ - from cares_reinforcement_learning.networks.CrossQ import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = CrossQ( + agent = DynaSAC_Bounded_Yao( actor_network=actor, critic_network=critic, - config=config, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample, ) + return agent -def create_DynaSAC(observation_size, action_num, config: acf.DynaSACConfig): +def create_STEVESAC(observation_size, action_num, config: acf.STEVESACConfig): """ Create networks for model-based SAC agent. The Actor and Critic is same. An extra world model is added. 
""" - from cares_reinforcement_learning.algorithm.mbrl import DynaSAC - from cares_reinforcement_learning.networks.DynaSAC import Actor, Critic - from cares_reinforcement_learning.networks.world_models import EnsembleWorldReward + from cares_reinforcement_learning.algorithm.mbrl import STEVESAC + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, + ) actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - world_model = EnsembleWorldReward( + world_model = Ensemble_Dyna_Big( observation_size=observation_size, num_actions=action_num, num_models=config.num_models, - lr=config.world_model_lr, + num_rwd_model=config.num_rwd_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, ) - agent = DynaSAC( + agent = STEVESAC( actor_network=actor, critic_network=critic, world_network=world_model, - config=config, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, ) return agent -################################### -# TD3 Algorithms # -################################### - - -def create_DDPG(observation_size, action_num, config: acf.DDPGConfig): - from cares_reinforcement_learning.algorithm.policy import DDPG - from cares_reinforcement_learning.networks.DDPG import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) +def create_STEVESAC_Bounded( + observation_size, action_num, config: acf.STEVESAC_BoundedConfig +): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ """ - device = hlp.get_device() - agent = DDPG( - actor_network=actor, - critic_network=critic, - config=config, - device=device, + from cares_reinforcement_learning.algorithm.mbrl import STEVESAC_Bounded + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) - return agent - - -def create_TD3(observation_size, action_num, config: acf.TD3Config): - from cares_reinforcement_learning.algorithm.policy import TD3 - from cares_reinforcement_learning.networks.TD3 import Actor, Critic actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - agent = TD3( - actor_network=actor, - critic_network=critic, - config=config, - device=device, - ) - return agent - - -def create_TD3AE(observation_size, action_num, config: acf.TD3AEConfig): - from cares_reinforcement_learning.algorithm.policy import TD3AE - from cares_reinforcement_learning.encoders.vanilla_autoencoder import Decoder - from cares_reinforcement_learning.networks.TD3AE import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - ae_config = config.autoencoder_config - decoder = Decoder( - observation_size["image"], - out_dim=actor.encoder.out_dim, - latent_dim=ae_config.latent_dim, - num_layers=ae_config.num_layers, - num_filters=ae_config.num_filters, - kernel_size=ae_config.kernel_size, - ) - device = hlp.get_device() - agent = TD3AE( - actor_network=actor, - critic_network=critic, - decoder_network=decoder, - config=config, + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, ) - return agent - -def create_NaSATD3(observation_size, action_num, config: acf.NaSATD3Config): - from cares_reinforcement_learning.algorithm.policy import NaSATD3 - from cares_reinforcement_learning.networks.NaSATD3 import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = NaSATD3( + agent = STEVESAC_Bounded( actor_network=actor, critic_network=critic, - config=config, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample, ) - return agent + return agent -def create_PERTD3(observation_size, action_num, config: acf.PERTD3Config): - from cares_reinforcement_learning.algorithm.policy import PERTD3 - from cares_reinforcement_learning.networks.PERTD3 import Actor, Critic - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) +def create_STEVESAC_Bounded_Yao( + observation_size, action_num, config: acf.STEVESAC_Bounded_YaoConfig +): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
+ """ - device = hlp.get_device() - agent = PERTD3( - actor_network=actor, - critic_network=critic, - config=config, - device=device, + from cares_reinforcement_learning.algorithm.mbrl import STEVESAC_Bounded_Yao + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) - return agent - - -def create_LAPTD3(observation_size, action_num, config: acf.LAPTD3Config): - from cares_reinforcement_learning.algorithm.policy import LAPTD3 - from cares_reinforcement_learning.networks.LAPTD3 import Actor, Critic actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - agent = LAPTD3( - actor_network=actor, - critic_network=critic, - config=config, + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, ) - return agent - -def create_PALTD3(observation_size, action_num, config: acf.PALTD3Config): - from cares_reinforcement_learning.algorithm.policy import PALTD3 - from cares_reinforcement_learning.networks.PALTD3 import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = PALTD3( + agent = STEVESAC_Bounded_Yao( actor_network=actor, critic_network=critic, - config=config, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, + alpha_lr=config.alpha_lr, + horizon=config.horizon, device=device, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, + exploration_sample=config.exploration_sample, ) - return agent + return agent -def create_LA3PTD3(observation_size, action_num, config: acf.LA3PTD3Config): - from cares_reinforcement_learning.algorithm.policy import LA3PTD3 - from cares_reinforcement_learning.networks.LA3PTD3 import Actor, Critic - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) +def create_DynaSAC_NS_IW(observation_size, action_num, config: acf.DynaSAC_NS_IWConfig): + """ + Create networks for model-based SAC agent. The Actor and Critic is same. + An extra world model is added. 
- device = hlp.get_device() - agent = LA3PTD3( - actor_network=actor, - critic_network=critic, - config=config, - device=device, + """ + from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_NS_IW + from cares_reinforcement_learning.networks.SAC import Actor, Critic + from cares_reinforcement_learning.networks.world_models.ensemble import ( + Ensemble_Dyna_Big, ) - return agent - - -def create_MAPERTD3(observation_size, action_num, config: acf.MAPERTD3Config): - from cares_reinforcement_learning.algorithm.policy import MAPERTD3 - from cares_reinforcement_learning.networks.MAPERTD3 import Actor, Critic actor = Actor(observation_size, action_num, config=config) critic = Critic(observation_size, action_num, config=config) device = hlp.get_device() - agent = MAPERTD3( - actor_network=actor, - critic_network=critic, - config=config, + + world_model = Ensemble_Dyna_Big( + observation_size=observation_size, + num_actions=action_num, + num_models=config.num_models, + num_rwd_model=config.num_rwd_models, device=device, + l_r=config.world_model_lr, + sas=config.sas, ) - return agent - -def create_RDTD3(observation_size, action_num, config: acf.RDTD3Config): - from cares_reinforcement_learning.algorithm.policy import RDTD3 - from cares_reinforcement_learning.networks.RDTD3 import Actor, Critic - - actor = Actor(observation_size, action_num, config=config) - critic = Critic(observation_size, action_num, config=config) - - device = hlp.get_device() - agent = RDTD3( + agent = DynaSAC_NS_IW( actor_network=actor, critic_network=critic, - config=config, + world_network=world_model, + actor_lr=config.actor_lr, + critic_lr=config.critic_lr, + gamma=config.gamma, + tau=config.tau, + action_num=action_num, device=device, + alpha_lr=config.alpha_lr, + horizon=config.horizon, + num_samples=config.num_samples, + train_both=config.train_both, + train_reward=config.train_reward, + gripper=config.gripper, + threshold=config.threshold, ) return agent -def create_CTD4(observation_size, action_num, config: acf.CTD4Config): - from cares_reinforcement_learning.algorithm.policy import CTD4 - from cares_reinforcement_learning.networks.CTD4 import Actor, Critic - - device = hlp.get_device() - - actor = Actor(observation_size, action_num, config=config) - ensemble_critic = Critic(observation_size, action_num, config=config) - - agent = CTD4( - actor_network=actor, - ensemble_critic=ensemble_critic, - config=config, - device=device, - ) - - return agent +# def create_DynaSAC_SAS(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. 
+# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SAS +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneSASReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneSASReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# lr=config.world_model_lr, +# device=device, +# ) +# +# agent = DynaSAC_SAS( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# device=device, +# ) +# return agent + + +# def create_DynaSAC_BIVReweight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. +# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_BIVReweight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneNSReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_BIVReweight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent +# +# +# def create_DynaSAC_SUNRISEReweight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. 
+# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_SUNRISEReweight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneNSReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_SUNRISEReweight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent +# +# +# def create_DynaSAC_UWACReweight(observation_size, action_num, config: AlgorithmConfig): +# """ +# Create networks for model-based SAC agent. The Actor and Critic is same. +# An extra world model is added. +# +# """ +# from cares_reinforcement_learning.algorithm.mbrl import DynaSAC_UWACReweight +# from cares_reinforcement_learning.networks.SAC import Actor, Critic +# from cares_reinforcement_learning.networks.world_models import EnsembleWorldAndOneNSReward +# +# actor = Actor(observation_size, action_num) +# critic = Critic(observation_size, action_num) +# +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# +# world_model = EnsembleWorldAndOneNSReward( +# observation_size=observation_size, +# num_actions=action_num, +# num_models=config.num_models, +# device=device, +# lr=config.world_model_lr, +# ) +# +# agent = DynaSAC_UWACReweight( +# actor_network=actor, +# critic_network=critic, +# world_network=world_model, +# actor_lr=config.actor_lr, +# critic_lr=config.critic_lr, +# gamma=config.gamma, +# tau=config.tau, +# action_num=action_num, +# device=device, +# alpha_lr=config.alpha_lr, +# horizon=config.horizon, +# num_samples=config.num_samples, +# threshold_scale=config.threshold_scale, +# reweight_critic=config.reweight_critic, +# reweight_actor=config.reweight_actor, +# mode=config.mode, +# sample_times=config.sample_times, +# ) +# return agent -# TODO return type base "Algorithm" class? class NetworkFactory: def create_network( self, diff --git a/cares_reinforcement_learning/util/uncertainty_estimation.py b/cares_reinforcement_learning/util/uncertainty_estimation.py new file mode 100644 index 00000000..d1c4010c --- /dev/null +++ b/cares_reinforcement_learning/util/uncertainty_estimation.py @@ -0,0 +1,36 @@ +import torch +import torch.nn.functional as F + + +def sampling(pred_means, pred_vars): + """ + High std means low uncertainty. 
Therefore, the score is typically used as a divisor when weighting samples by confidence.
+
+    Assumes an ensemble of five models; each predictive Normal(mean, var) is sampled
+    10 times and the spread of the pooled samples gives the per-prediction score.
+
+    :param pred_means: per-model predicted means, shape [5, num_predictions, state_dim]
+    :param pred_vars: per-model predicted variances, shape [5, num_predictions, state_dim]
+    :return: detached per-prediction scores squashed into (0.5, 1.0)
+    """
+    # 5 models, each sampled 10 times = 50 samples per prediction.
+    sample1 = torch.distributions.Normal(pred_means[0], pred_vars[0]).sample([10])
+    sample2 = torch.distributions.Normal(pred_means[1], pred_vars[1]).sample([10])
+    sample3 = torch.distributions.Normal(pred_means[2], pred_vars[2]).sample([10])
+    sample4 = torch.distributions.Normal(pred_means[3], pred_vars[3]).sample([10])
+    sample5 = torch.distributions.Normal(pred_means[4], pred_vars[4]).sample([10])
+
+    samples = torch.cat((sample1, sample2, sample3, sample4, sample5))
+    # samples: [5 * 10, num_predictions, state_dim]
+    variances = torch.var(samples, dim=0)
+    # variances: [num_predictions, state_dim] -> reduce to one scalar per prediction
+    total_stds = torch.mean(variances, dim=1)
+
+    # Squash into (0.5, 1.0): larger ensemble spread -> larger score.
+    total_stds = F.sigmoid(total_stds)
+    # Alternative normalisations (inverting the score, mean-normalising, or
+    # min-shifting it) were trialled here and are intentionally left disabled.
+    return total_stds.detach()
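
Below is a minimal usage sketch of the new `World_Model` reward-ensemble interface introduced in this patch. It is illustrative only: the import path, the 0.001 learning rate, and the random placeholder tensors are assumptions made for the example, while the constructor arguments and the `train_reward` / `pred_rewards` signatures follow the class as added above.

import torch

# Assumed import path: the class is defined in the new world_model.py module above.
from cares_reinforcement_learning.networks.world_models.world_model import World_Model

device = "cuda" if torch.cuda.is_available() else "cpu"
observation_size, num_actions, batch = 11, 3, 32  # illustrative sizes only

world = World_Model(
    observation_size=observation_size,
    num_actions=num_actions,
    l_r=0.001,        # matches the world_model_lr default used in the configs above
    device=device,
    sas=True,         # reward conditioned on (state, action, next_state)
    prob_rwd=True,    # probabilistic reward heads trained with Gaussian NLL
    num_rwd_model=5,  # size of the reward ensemble
)

# One reward-model update on a random placeholder batch.
states = torch.rand(batch, observation_size, device=device)
actions = torch.rand(batch, num_actions, device=device)
next_states = torch.rand(batch, observation_size, device=device)
rewards = torch.rand(batch, 1, device=device)
world.train_reward(states, actions, next_states, rewards)

# Ensemble-averaged reward prediction plus a combined
# epistemic/aleatoric uncertainty estimate.
pred_reward, reward_uncertainty = world.pred_rewards(states, actions, next_states)
print(pred_reward.shape, reward_uncertainty.shape)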