# DDQN_Agent.py

# -*- coding: utf-8 -*-
"""DDQN_Agent.py

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1AILIXrr-pVFnNcXY8MV2myhJqu3X2MOS
"""

# Commented out IPython magic to ensure Python compatibility.
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim

import yfinance as yf
from collections import deque
import random
import math
from tqdm import tqdm

# DDQN Model
class DDQN(nn.Module):
    """Four-layer fully connected network used by the DDQN agent.

    Layer widths are ``state_dim -> 64 -> 32 -> 8 -> action_dim``. The three
    hidden layers use ReLU; the output layer is passed through a softmax over
    the last dimension, so every output row is non-negative and sums to 1.

    NOTE(review): Q-value regression usually uses an unbounded linear output
    head; confirm that the softmax here is intentional.
    """

    def __init__(self, state_dim, action_dim):
        """Create the linear layers.

        Args:
            state_dim: Size of the last dimension of the input tensor.
            action_dim: Number of values produced per input row.
        """
        super().__init__()
        # Attribute names and creation order are part of the checkpoint
        # format (state_dict keys) and of seeded initialisation.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=8)
        self.fc4 = nn.Linear(in_features=8, out_features=action_dim)

    def forward(self, x):
        """Return softmax-normalised scores for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        scores = self.fc4(hidden)
        return torch.softmax(scores, dim=-1)

# DDQN Agent
class DDQN_Agent:
    """Double-DQN agent with epsilon-greedy exploration and soft target updates.

    The agent owns two networks built by ``create_model``: ``model`` (the
    online network, trained by ``experience_replay``) and ``model_target``
    (blended towards ``model`` by ``update_model_target``). Transitions are
    stored in a bounded replay memory via ``remember``.
    """

    def __init__(self, state_dim, tau=0.0001, is_eval=False, model_name=""):
        """Build the networks, optimizer and exploration schedule.

        Args:
            state_dim: Input feature size passed to the network constructor.
            tau: Blend factor used by ``update_model_target``.
            is_eval: When true, weights are loaded from ``{model_name}.pth``
                and ``{model_name}_target.pth``, both networks are put in
                eval mode, and ``act`` never explores.
            model_name: Checkpoint path prefix; only read when ``is_eval``.
        """
        self.model_type = "DDQN"
        self.state_dim = state_dim
        self.action_dim = 3  # hold, sell, and buy
        self.memory = deque(maxlen=100)  # oldest transitions are evicted first
        self.buffer_size = 60  # not read inside this class; kept for external users

        self.gamma = 0.95
        self.epsilon = 1.0  # initial exploration rate
        self.epsilon_min = 0.01  # minimum exploration rate
        self.epsilon_decay = 0.995  # decrease exploration rate as the agent becomes good at trading
        self.is_eval = is_eval
        self.tau = tau
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Both modes build the online network first, then the target network.
        self.model = self.create_model().to(self.device)
        self.model_target = self.create_model().to(self.device)

        if is_eval:
            # map_location=self.device handles CPU-only hosts and GPU hosts
            # alike, so no per-device branch is needed.
            # NOTE: torch.load unpickles the file; only load trusted checkpoints.
            self.model.load_state_dict(
                torch.load(f'{model_name}.pth', map_location=self.device))
            self.model_target.load_state_dict(
                torch.load(f'{model_name}_target.pth', map_location=self.device))
            self.model.eval()
            self.model_target.eval()
        else:
            # Start training with the target network equal to the online one.
            self.model_target.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.01)
        self.loss_fn = nn.MSELoss()

    def create_model(self):
        """Return a fresh, randomly initialised network (on the default device)."""
        return DDQN(self.state_dim, self.action_dim)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action index for ``state``.

        Outside eval mode a uniformly random action is returned with
        probability ``epsilon``; otherwise the argmax of the online network's
        output is returned.

        Args:
            state: A flat feature vector or a single-row ``(1, state_dim)``
                array-like.

        Returns:
            The chosen action index as a Python int.
        """
        if not self.is_eval and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_dim)

        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            options = self.model(state)
        return torch.argmax(options[0]).item()

    def _as_batch(self, state):
        """Convert one state to a float32 ``(1, features)`` tensor on the agent's device."""
        tensor = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        return tensor.reshape(1, -1)

    def experience_replay(self, batch_size):
        """Train the online network on a random mini-batch from memory.

        One optimizer step is taken per sampled transition. Afterwards
        ``epsilon`` is decayed (while above ``epsilon_min``) and the target
        network is soft-updated.

        Args:
            batch_size: Number of transitions to sample.

        Returns:
            The loss of the last transition as a float, or ``None`` when
            ``batch_size`` is not positive or memory holds fewer than
            ``batch_size`` transitions (nothing is trained in that case).
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return None  # nothing to sample, or not enough samples in memory

        mini_batch = random.sample(self.memory, batch_size)

        loss = None
        for state, action, reward, next_state, done in mini_batch:
            # Normalise to (1, features) so flat and single-row states are
            # both accepted, matching what act() tolerates.
            state = self._as_batch(state)

            # Double DQN: select the action with the online model, evaluate
            # it with the target network.
            target = reward
            if not done:
                # next_state is only needed (and only converted) for
                # non-terminal transitions.
                next_state = self._as_batch(next_state)
                with torch.no_grad():  # the target is a constant; no graph needed
                    best_action = torch.argmax(self.model(next_state)[0]).item()
                    next_value = self.model_target(next_state)[0, best_action].item()
                target = reward + self.gamma * next_value

            # Single forward pass: the detached copy is the regression target
            # with only the taken action's entry replaced, so the other
            # outputs contribute zero error.
            q_values = self.model(state)
            target_f = q_values.detach().clone()
            target_f[0, action] = target

            self.optimizer.zero_grad()
            loss = self.loss_fn(q_values, target_f)
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # Update the target model with the soft-update rule
        self.update_model_target()

        return loss.item()

    def update_model_target(self):
        """Soft-update the target network.

        Every target entry becomes ``tau * online + (1 - tau) * target``.
        """
        model_weights = self.model.state_dict()
        model_target_weights = self.model_target.state_dict()

        # Blend each online entry into the matching target entry
        for key, online_value in model_weights.items():
            model_target_weights[key] = (
                self.tau * online_value + (1 - self.tau) * model_target_weights[key]
            )

        # Load the blended weights into the target model
        self.model_target.load_state_dict(model_target_weights)