REINFORCE.py
'''
REINFORCE Monte Carlo Policy Gradient AI Player
Author: Lei Mao
Date: 5/2/2017
Introduction:
The REINFORCE_AI uses REINFORCE, one of the Monte Carlo policy gradient methods, to optimize the AI's actions in a given environment. Implementing the REINFORCE loss function in Keras is cumbersome; Tensorflow, although it takes more time to construct the neural network, makes it easier to customize loss functions.
'''
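# Background: REINFORCE performs stochastic gradient ascent on the expected return J(theta)
# using the estimator
#     grad J(theta) ~= sum_t grad_theta log(pi(a_t | s_t; theta)) * G_t,
# where G_t is the discounted return from time step t. The graph built below implements this
# by minimizing mean(-log(pi(a_t | s_t)) * G_t) with Adam.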
import os
import numpy as np
import tensorflow as tf
GAMMA = 0.99 # decay rate of past observations
LEARNING_RATE = 0.00005 # learning rate in deep learning
RAND_SEED = 0 # random seed
SAVE_PERIOD = 100 # period of time steps to save the model
LOG_PERIOD = 100 # period of time steps to save the log of training
MODEL_DIR = 'model/' # path for saving the model
LOG_DIR = 'log/' # path for saving the training log
np.random.seed(RAND_SEED)
tf.set_random_seed(RAND_SEED)
class REINFORCE():

    def __init__(self, num_actions, num_features):
        # Initialize the number of player actions available in the game
        self.num_actions = num_actions
        # Initialize the number of features in the observation
        self.num_features = num_features
        # Initialize the model (builds the Tensorflow graph)
        self.model = self.REINFORCE_FC_Setup()
        # Initialize tensorflow session
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        # Initialize the episode number
        self.episode = 0
        # Initialize episode replays used for caching game transitions in one single episode
        self.episode_observations = list() # observation feature list
        self.episode_actions = list() # action index for each step
        self.episode_rewards = list() # immediate reward
    def Softmax_Cross_Entropy(self, softmax_label, softmax_pred):
        # Calculate cross entropy for softmaxed label and prediction matrices
        # This function is not used in the Tensorflow version of the code
        return (-1.) * np.dot(softmax_label, np.log(softmax_pred.T))

    def One_Hot_Encoding(self, labels, num_class):
        # Transform labels to a one-hot encoded array
        # This function is not used in the Tensorflow version of the code
        matrix_encoded = np.zeros((len(labels), num_class), dtype=bool)
        matrix_encoded[np.arange(len(labels)), labels] = 1
        return matrix_encoded
    def REINFORCE_FC_Setup(self):
        # Set up the REINFORCE Tensorflow graph
        with tf.name_scope('inputs'):
            self.tf_observations = tf.placeholder(tf.float32, [None, self.num_features], name='observations')
            self.tf_actions = tf.placeholder(tf.int32, [None, ], name='num_actions')
            self.tf_values = tf.placeholder(tf.float32, [None, ], name='state_values')
        # FC1
        fc1 = tf.layers.dense(
            inputs=self.tf_observations,
            units=16,
            activation=tf.nn.tanh, # tanh activation
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.5),
            bias_initializer=tf.constant_initializer(0.1),
            name='FC1'
        )
        # FC2
        fc2 = tf.layers.dense(
            inputs=fc1,
            units=32,
            activation=tf.nn.tanh, # tanh activation
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.4),
            bias_initializer=tf.constant_initializer(0.1),
            name='FC2'
        )
        # Optional deeper layers (currently disabled)
        # fc3 = tf.layers.dense(
        #     inputs=fc2,
        #     units=8,
        #     activation=tf.nn.sigmoid, # sigmoid activation
        #     kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        #     bias_initializer=tf.constant_initializer(0.1),
        #     name='FC3'
        # )
        # fc4 = tf.layers.dense(
        #     inputs=fc3,
        #     units=8,
        #     activation=tf.nn.leaky_relu, # leaky ReLU activation
        #     kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        #     bias_initializer=tf.constant_initializer(0.1),
        #     name='FC4'
        # )
        # Output layer (logits over actions)
        logits = tf.layers.dense(
            inputs=fc2,
            units=self.num_actions,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='FC5'
        )
        # Softmax
        self.action_probs = tf.nn.softmax(logits, name='action_probs')
        with tf.name_scope('loss'):
            # Maximizing (log_p * V) is equivalent to minimizing -(log_p * V)
            # Construct the loss function mean(-(log_p * V)) to be minimized by Tensorflow
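            # Concretely, for one episode of length T the loss below evaluates to
            #     loss = -(1/T) * sum_t log(pi(a_t | s_t)) * G_t,
            # where pi is the softmax policy and G_t is the (normalized) return from Calculate_Value()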
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.tf_actions) # this equals -log_p for the taken action
            self.loss = tf.reduce_mean(neg_log_prob * self.tf_values)
        with tf.name_scope('train'):
            self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
    def REINFORCE_FC_Restore(self):
        # Restore the trained model
        self.saver.restore(self.sess, MODEL_DIR + 'AI_model')

    def Store_Transition(self, observation, action, reward):
        # Store game transitions used for updating the weights in the Policy Neural Network
        self.episode_observations.append(observation)
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)

    def Clear_Episode_Replays(self):
        # Clear game transitions
        self.episode_observations = list()
        self.episode_actions = list()
        self.episode_rewards = list()
    def Calculate_Value(self):
        # The estimate of v(St) is updated in the direction of the complete return:
        # Gt = R_{t+1} + gamma * R_{t+2} + gamma^2 * R_{t+3} + ... + gamma^(T-t-1) * R_T,
        # where T is the last time step of the episode.
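        # For example, with episode_rewards = [0, 0, 1] and GAMMA = 0.99, the backward
        # recursion below gives raw returns [0.9801, 0.99, 1.0], which are then
        # standardized (zero mean, unit variance) before being fed to the loss.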
        state_values = np.zeros_like(self.episode_rewards, dtype=np.float64)
        state_values[-1] = self.episode_rewards[-1]
        for t in reversed(range(0, len(self.episode_rewards) - 1)):
            state_values[t] = GAMMA * state_values[t + 1] + self.episode_rewards[t]
        # Normalize to help control the variance of the gradient estimator
        state_values -= np.mean(state_values)
        state_values /= np.std(state_values)
        return state_values
    def REINFORCE_FC_Train(self):
        # Train the model using data from one episode
        state_values = self.Calculate_Value()
        # Start gradient descent
        _, train_loss = self.sess.run([self.optimizer, self.loss], feed_dict={
            self.tf_observations: np.vstack(self.episode_observations),
            self.tf_actions: np.array(self.episode_actions),
            self.tf_values: state_values})
        # Print train_loss
        print('Episode train loss: %f' % train_loss)
        # Clear episode replays after training for one episode
        self.Clear_Episode_Replays()
        return train_loss
    def AI_Action(self, observation):
        # Calculate action probabilities for the given observation
        prob_weights = self.sess.run(self.action_probs, feed_dict={self.tf_observations: observation[np.newaxis, :]})
        # Randomly choose an action according to the probabilities
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())
        return action
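
# Usage sketch (not part of the original file): a minimal training loop assuming an
# OpenAI-Gym-style environment such as 'CartPole-v0' with the classic step/reset API.
# The environment name, episode count, and wrapper interface are illustrative assumptions;
# the game this agent is actually paired with in the repository may differ.
if __name__ == '__main__':
    import gym  # assumption: OpenAI Gym is installed

    env = gym.make('CartPole-v0')
    agent = REINFORCE(num_actions=env.action_space.n,
                      num_features=env.observation_space.shape[0])

    for episode in range(1000):  # illustrative number of training episodes
        observation = env.reset()
        done = False
        while not done:
            # Sample an action from the current policy and step the environment
            action = agent.AI_Action(observation)
            next_observation, reward, done, _ = env.step(action)
            # Cache the transition for the Monte Carlo update at the end of the episode
            agent.Store_Transition(observation, action, reward)
            observation = next_observation
        # One REINFORCE update per completed episode
        agent.REINFORCE_FC_Train()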