# 17_4_CARTPOLE.py
import gym
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
'''
17_4:
Use policy gradient reinforcement learning to balance an inverted pendulum
mounted on a cart driven by a controller that you train.
(Note: although the exercise is phrased in terms of policy gradients, the
code below implements a value-based Deep Q-Network.)
References:
https://lilianweng.github.io/posts/2018-04-08-policy-gradient/
https://www.youtube.com/watch?v=wc-FxNENg9U&t=87s
There are two actions: push left or push right.
There are four state variables: cart position, cart velocity, pole angle,
and pole velocity at the tip.
The pole angle and velocity are symmetric about the upright position.
'''
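# Hedged sketch (not in the original file): a small helper to confirm the
# spaces described in the docstring above. The helper name and the
# 'CartPole-v1' environment id are illustrative assumptions.
def describe_env(env_name='CartPole-v1'):
    env = gym.make(env_name)
    print(env.observation_space.shape)  # (4,): position, velocity, angle, angular velocity
    print(env.action_space.n)           # 2: push left, push right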
class DeepQNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to one Q-value
    per action."""

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(DeepQNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)  # raw Q-values, one per action (no final activation)
        return actions
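# Usage sketch (illustrative, not in the original file): a forward pass maps a
# batch of 4-dimensional CartPole states to one Q-value per action.
def _demo_forward():
    net = DeepQNetwork(lr=0.001, input_dims=[4], fc1_dims=256, fc2_dims=256,
                       n_actions=2)
    dummy_state = T.zeros((1, 4)).to(net.device)
    q_values = net.forward(dummy_state)
    print(q_values.shape)  # torch.Size([1, 2])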
class Agent():
    """Epsilon-greedy DQN agent with a uniform experience-replay buffer."""

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate, decayed toward eps_end
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0  # total transitions stored so far
        self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions,
                                   input_dims=input_dims,
                                   fc1_dims=256, fc2_dims=256)
        # Replay buffer, stored as parallel arrays indexed by position.
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # Whether each stored state is terminal (game is done or not);
        # np.bool is deprecated, so use the builtin bool dtype.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
    def store_transition(self, state, action, reward, state_, terminal):
        # Ring buffer: wrap around and overwrite the oldest transition.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.terminal_memory[index] = terminal
        self.action_memory[index] = action
        self.mem_cntr += 1
    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # Exploit: pick the action with the highest predicted Q-value.
            state = T.tensor(np.array([observation]), dtype=T.float32).to(self.Q_eval.device)
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            # Explore: pick a uniformly random action.
            action = np.random.choice(self.action_space)
        return action
    def learn(self):
        """Gets called every step: sample a minibatch from the replay buffer
        and take one gradient step on the temporal-difference error."""
        # Wait until enough transitions are stored to fill a batch.
        if self.mem_cntr < self.batch_size:
            return
        self.Q_eval.optimizer.zero_grad()
        # Sample only from the filled portion of the buffer, without replacement.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
        action_batch = self.action_memory[batch]
        # Q(s, a) for the actions actually taken.
        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        # Bootstrap target: r + gamma * max_a' Q(s', a'). Detach so the target
        # does not backpropagate, and zero it out for terminal states.
        q_next = self.Q_eval.forward(new_state_batch).detach()
        q_next[terminal_batch] = 0.0
        q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]  # T.max returns (values, indices); we want the values
        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()
        # Linearly decay epsilon toward its floor.
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
if __name__ == '__main__':
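    # Minimal training-loop sketch: the original file is truncated here, so
    # this body is an assumed reconstruction, not the author's code. The
    # hyperparameters are illustrative, and the pre-0.26 gym step API is
    # assumed, where env.reset() returns just the observation and env.step()
    # returns (obs, reward, done, info).
    env = gym.make('CartPole-v1')
    agent = Agent(gamma=0.99, epsilon=1.0, lr=0.001, input_dims=[4],
                  batch_size=64, n_actions=2)
    scores = []
    n_games = 500
    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()
            observation = observation_
        scores.append(score)
        avg_score = np.mean(scores[-100:])
        print(f'episode {i} score {score:.1f} avg_score {avg_score:.1f} '
              f'epsilon {agent.epsilon:.2f}')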