17_4_CARTPOLE.py
    import torch as T
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    
    import numpy as np
    import gym
    
    '''
    17_4:
    Use policy gradient reinforcement learning to balance an inverted pendulum
    mounted on a cart, driven by a controller that you train.

    Note: although the task statement and the first reference discuss policy
    gradient methods, this implementation uses a Deep Q-Network (value-based
    learning with epsilon-greedy exploration).

    References:
    https://lilianweng.github.io/posts/2018-04-08-policy-gradient/
    https://www.youtube.com/watch?v=wc-FxNENg9U&t=87s

    There are two actions: push the cart left or push it right.
    There are four state variables: cart position, cart velocity, pole angle,
    and pole velocity at tip.

    Pole angle and pole velocity are also symmetric about zero (the upright
    equilibrium).
    '''
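
    # The dimensions quoted above can be read off the environment itself. The helper
    # below is a sketch added for illustration (it is not part of the original file,
    # and its name is ours); it assumes the classic `gym` package and the
    # CartPole-v1 environment id.
    def _describe_cartpole_spaces():
        import gym  # local import keeps this helper self-contained
        env = gym.make('CartPole-v1')
        print(env.observation_space.shape)  # (4,): cart position, cart velocity, pole angle, pole velocity
        print(env.action_space.n)           # 2: push left (0), push right (1)
        env.close()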
    
    class DeepQNetwork(nn.Module):
        def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
            super(DeepQNetwork, self).__init__()
            self.input_dims = input_dims
            self.fc1_dims = fc1_dims
            self.fc2_dims = fc2_dims
            self.n_actions = n_actions
            # input_dims is a list/tuple such as [4], hence the unpacking
            self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
            self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
            # one output per action: the estimated Q-value of taking that action
            self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
            self.optimizer = optim.Adam(self.parameters(), lr=lr)
            self.loss = nn.MSELoss()
            self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
            self.to(self.device)
    
        def forward(self,state):
            x = F.relu(self.fc1(state))
            x = F.relu(self.fc2(x))
            actions = self.fc3(x)
    
            return actions
        
    class Agent():
        def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                     max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
            self.gamma = gamma
            self.epsilon = epsilon
            self.eps_min = eps_end
            self.eps_dec = eps_dec
            self.lr = lr
            self.action_space = [i for i in range(n_actions)]
            self.mem_size = max_mem_size
            self.batch_size = batch_size
            self.mem_cntr = 0
    
            self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions, input_dims=input_dims,
                                        fc1_dims=256, fc2_dims=256)
            self.state_memory = np.zeros((self.mem_size,*input_dims), dtype=np.float32)
            self.new_state_memory = np.zeros((self.mem_size,*input_dims), dtype=np.float32)
            self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
            self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
            self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)  # whether the episode is done after this transition (np.bool was removed from recent NumPy; use np.bool_)
    
        def store_transition(self, state, action, reward, state_, terminal):
            index = self.mem_cntr % self.mem_size
            self.state_memory[index] = state
            self.new_state_memory[index] = state_
            self.reward_memory[index] = reward
            self.terminal_memory[index] = terminal
            self.action_memory[index] = action
    
            self.mem_cntr += 1
    
        def choose_action(self, observation):
            # epsilon-greedy: exploit the learned Q-values with probability 1 - epsilon
            if np.random.random() > self.epsilon:
                # add a batch dimension and match the network's float32 weights
                state = T.tensor(np.array([observation]), dtype=T.float32).to(self.Q_eval.device)
                actions = self.Q_eval.forward(state)
                action = T.argmax(actions).item()
            else:
                action = np.random.choice(self.action_space)
    
            return action
        
        def learn(self):
            """
            Get's called every step
            """
            if self.mem_cntr < self.batch_size:
                return
    
            self.Q_eval.optimizer.zero_grad()
    
            max_mem = self.mem_cntr if self.mem_cntr < self.mem_size else self.mem_size
    
            # sample batch_size distinct indices uniformly from the filled part of the buffer
            batch = np.random.choice(max_mem, self.batch_size, replace=False)

            batch_index = np.arange(self.batch_size, dtype=np.int32)
    
            state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
            new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
            reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
            terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
            action_batch = self.action_memory[batch]
    
            # Q-values of the actions that were actually taken in the sampled transitions
            q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]

            # TD target: the bootstrapped next-state values are treated as constants,
            # so gradients only flow through q_eval
            with T.no_grad():
                q_next = self.Q_eval.forward(new_state_batch)
                q_next[terminal_batch] = 0.0
                # T.max returns a (values, indices) tuple; we only need the values
                q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

            loss = self.Q_eval.loss(q_eval, q_target)
            loss.backward()
            self.Q_eval.optimizer.step()
    
            self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
    
    if __name__ == '__main__':
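        # The excerpt ends at the __main__ guard. The loop below is a minimal training
        # sketch added for illustration (not part of the original file). It assumes the
        # classic gym API, where env.reset() returns only the observation and env.step()
        # returns (obs, reward, done, info); newer gym/gymnasium releases split `done`
        # into `terminated`/`truncated`. Hyperparameters are illustrative, not tuned.
        env = gym.make('CartPole-v1')
        agent = Agent(gamma=0.99, epsilon=1.0, lr=0.001, input_dims=[4],
                      batch_size=64, n_actions=2)
        scores = []
        n_games = 500

        for i in range(n_games):
            score = 0
            done = False
            observation = env.reset()
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                score += reward
                agent.store_transition(observation, action, reward, observation_, done)
                agent.learn()
                observation = observation_
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print(f'episode {i}  score {score:.1f}  average score {avg_score:.1f}  '
                  f'epsilon {agent.epsilon:.2f}')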