import numpy as np
import gym

# Seed numpy's global RNG; np.random.choice is used further down to sample
# actions from the policy, so this fixes that sampling sequence.
seed = 42
np.random.seed(seed)

# instantiate the CartPole environment
env = gym.make("CartPole-v1")

# Seed the environment itself and, separately, the RNG behind
# env.action_space.sample() (used by the random-policy rollout below).
# NOTE(review): both calls depend on the installed gym version's seeding API — confirm.
env.seed(seed)
env.action_space.np_random.seed(seed)

# RL problem parameters
gamma = 0.99  # discount factor applied when turning rewards into returns
max_steps_per_episode = 10000 # hard cap on steps per rollout; the rollout loop stops earlier once env.step reports done
return_solved = 300 # training stops once the running (exponentially averaged) return exceeds this value


from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Render 80 steps of a uniformly random policy inside the notebook.
env.reset()
img = plt.imshow(env.render(mode='rgb_array')) # create the image artist once, then only update its data

for _frame in range(80):
    # redraw the current frame in place
    img.set_data(env.render(mode='rgb_array')) # update data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    # choose a random action
    action = env.action_space.sample()
    # take action; only the termination flag is needed here
    _, _, is_done, _ = env.step(action)
    # Start a fresh episode once the current one has ended, instead of
    # continuing to step an environment that already reported done.
    if is_done:
        env.reset()
# release the rendering resources held by the environment
env.close()


import jax.numpy as jnp # jax's numpy version with GPU support
from jax import random # used to define a RNG key to control the random input in JAX
from jax.experimental import stax # neural network library
from jax.experimental.stax import Dense, Relu, LogSoftmax, FanOut # neural network layers

# PRNG key handed to the parameter initializer below (see JAX docs)
rng = random.PRNGKey(seed)


# Shared trunk feeding both heads: one fully connected layer of 128 units
# followed by a ReLU non-linearity.
shared_base = stax.serial(
    Dense(128),
    Relu,
)

# Actor head: one output per discrete action, passed through LogSoftmax,
# i.e. the head returns LOG-probabilities, not probabilities.
actor_head = stax.serial(
    Dense(env.action_space.n),
    LogSoftmax,
)

# Critic head: a single linear output with no activation.
critic_head = Dense(1)

# Assemble trunk -> FanOut(2) (duplicate the trunk output) -> the two heads in
# parallel. stax.serial returns the pair (init function, apply function).
initialize_params, predict = stax.serial(
    shared_base,
    FanOut(2),
    stax.parallel(actor_head, critic_head),
)


# Initialize the model parameters. The leading -1 stands for the (variable)
# number of time steps; the remaining entries are the observation shape.
input_shape = (-1, *env.observation_space.shape)
output_shape, inital_params = initialize_params(rng, input_shape)

print(f'\noutput shape of the AC network is {output_shape} for (actor, critic).\n')


# Smoke test: push a dummy batch of three all-ones states through the network.
states = np.ones((3, *env.observation_space.shape), dtype=np.float32)

actor_predictions, critic_predictions = predict(inital_params, states)
# report the shapes produced by the two heads
print(f"actor head shape: {actor_predictions.shape}")
print(f"critic head shape: {critic_predictions.shape}")

# The actor emits log-probabilities, so exponentiating and summing over the
# action axis should give 1 for every state.
actor_probability_sums = np.sum(jnp.exp(actor_predictions), axis=1)
print(f'\nconservation of probability for actor: {actor_probability_sums}')

WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)

output shape of the AC network is ((-1, 2), (-1, 1)) for (actor, critic).

actor head shape: (3, 2)
critic head shape: (3, 1)

conservation of probability for actor: [1. 1. 1.]


### define loss and accuracy functions

from jax import grad, lax
from jax.tree_util import tree_flatten # jax params are stored as nested tuples; use this to manipulate tuples


def huber_loss(x, delta: float = 1.0):
    """
    Element-wise Huber loss of the residual `x`.

    The loss is quadratic close to zero and linear in the tails:

        0.5 * x^2                                if |x| <= delta
        0.5 * delta^2 + delta * (|x| - delta)    if |x| >  delta

    x: array or scalar,
        residual(s) to penalize.
    delta: float,
        threshold at which the loss switches from quadratic to linear.

    Returns:
        array of the same shape as `x` with the per-element loss.

    """
    magnitude = jnp.abs(x)
    # portion of |x| inside the quadratic region (never larger than delta)
    capped = jnp.minimum(magnitude, delta)
    # Portion of |x| beyond delta. Equals max(|x| - delta, 0), but is written
    # as a difference to avoid potentially doubling the gradient.
    overshoot = magnitude - capped
    return delta * overshoot + 0.5 * capped ** 2

def l2_regularizer(params, lmbda):
    r"""
    L2 penalty $\lambda \sum_j ||\theta_j||^2$ summed over every parameter
    array $\theta_j$ found in the pytree `params`.

    params: object (jax pytree),
        model parameters; every leaf contributes its sum of squared entries.
    lmbda: float,
        regularization strength multiplying the total squared norm.

    Returns:
        scalar penalty.

    """
    # flatten the nested parameter structure into a flat list of arrays
    leaves, _ = tree_flatten(params)
    squared_norms = [jnp.sum(jnp.abs(leaf) ** 2) for leaf in leaves]
    return lmbda * jnp.sum(jnp.array(squared_norms))


def AC_loss(params, trajectory):
    """
    Compute the scalar Actor-Critic (pseudo-)loss for a single trajectory.

    The result is the sum of three terms:
      * actor pseudo-loss: minus the mean of log pi(a_t|s_t) * (G_t - V(s_t)),
        with the critic value treated as a constant (stop_gradient);
      * critic loss: mean Huber loss between V(s_t) and the return G_t;
      * L2 regularization of all parameters with strength 0.001.

    params: object (jax pytree),
        parameters of the actor-critic network evaluated by `predict`.
    trajectory: tuple (states, actions, returns) holding the RL states, actions
        and returns (not the rewards!) of one trajectory of length T:
        states: array of shape (T,) + env.observation_space.shape
        actions: integer array of shape (T,); entry t is the index of the
            action taken at step t
        returns: array of shape (T,)

    Returns:
        scalar loss, differentiable w.r.t. `params`.

    """
    # extract data from the batch
    states, actions, returns = trajectory
    # evaluate both heads: log-probabilities (T, n_actions) and values (T, 1)
    actor_preds, critic_preds = predict(params, states)
    # Drop only the trailing singleton axis. A bare squeeze() would also remove
    # the time axis of a one-step trajectory and rely on implicit broadcasting.
    critic_preds = critic_preds.squeeze(axis=-1)
    # pick the log-probability of the action actually taken at each step
    actor_preds_select = jnp.take_along_axis(
        actor_preds, jnp.expand_dims(actions, axis=1), axis=1
    ).squeeze(axis=1)
    # Actor pseudo-loss; the minus sign turns return MAXimization into a
    # quantity suitable for gradient DEscent. The critic acts as a baseline
    # only, so no gradient flows into it through this term.
    advantages = returns - lax.stop_gradient(critic_preds)
    loss_actor = -jnp.mean(actor_preds_select * advantages)
    # critic loss: Huber loss between predicted values and observed returns
    loss_critic = jnp.mean(huber_loss(critic_preds - returns))
    return loss_actor + loss_critic + l2_regularizer(params, 0.001)


### define generalized gradient descent optimizer and a function to update model parameters

from jax.experimental import optimizers # gradient descent optimizers
from jax import jit

step_size = 0.01 # learning rate handed to the Adam optimizer below

# Build the Adam optimizer. The call returns three functions: opt_init wraps
# initial parameters into an optimizer state, opt_update applies one gradient
# step to such a state, and get_params extracts the parameters from a state.
opt_init, opt_update, get_params = optimizers.adam(step_size)


# define function which updates the parameters using the change computed by the optimizer
@jit # Just In Time compilation speeds up the code; requires to use jnp everywhere; remove when debugging
def update(i, opt_state, trajectory):
    """
    Perform one optimizer step on the Actor-Critic loss.

    i: int,
        index of the update step, forwarded to the optimizer
    opt_state: object,
        current state of the optimizer (holds the model parameters)
    trajectory: tuple (states, actions, returns),
        data of one trajectory, passed unchanged to AC_loss

    Returns:
    opt_state: object,
        the optimizer state after the step

    """
    # parameters currently stored inside the optimizer state
    params = get_params(opt_state)
    # gradient of the loss with respect to its first argument (the parameters)
    loss_gradient = grad(AC_loss)
    # let the optimizer turn the gradient into a new state
    return opt_update(i, loss_gradient(params, trajectory), opt_state)


### Train model

import time

def discounted_returns(rewards, discount):
    """
    Compute the discounted return G_t = sum_{s >= t} discount^(s - t) * r_s
    for every step t of one trajectory.

    rewards: sequence of float,
        per-step rewards r_0, ..., r_{T-1}
    discount: float,
        discount factor

    Returns:
        np.array of shape (T,) holding G_0, ..., G_{T-1}
        (an empty array when `rewards` is empty).

    """
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    # Backward recursion G_t = r_t + discount * G_{t+1}: the reward at step t
    # is undiscounted and later rewards are discounted progressively more.
    for t in range(rewards.shape[0] - 1, -1, -1):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns


# running statistics
running_return = 0.0
episode = 0


print("\nStart training...\n")

# set the initial model parameters in the optimizer
opt_state = opt_init(inital_params)


while True:  # run until "solved", see break condition below

    # record time
    start_time = time.time()

    # reset environment
    state = env.reset()
    episode_return = 0.0

    # parameters stay fixed while the trajectory is collected
    current_params = get_params(opt_state)

    # states, actions and rewards collected along the trajectory
    states, actions, rewards = [], [], []

    # loop over timesteps of episode to generate a trajectory
    for time_step in range(max_steps_per_episode):

        # record state
        states.append(state)

        # call network to compute \log\pi(:|s); the critic output is not needed here
        log_pi_s, _ = predict(current_params, state)

        # sample an action from the actor's probability distribution
        action = np.random.choice(env.action_space.n, p=np.exp(log_pi_s))

        # record selected action
        actions.append(action)

        # take action, observe next state and receive reward
        state, reward, done, _ = env.step(action)

        # record reward
        rewards.append(reward)

        # update current episode return
        episode_return += reward

        # stop as soon as the environment reports the end of the episode
        if done:
            break

    # compute discounted returns from the bare rewards
    returns = discounted_returns(rewards, gamma)

    # define trajectory data
    trajectory = (np.array(states), np.array(actions), returns)

    # update model (one gradient step per episode)
    opt_state = update(episode, opt_state, trajectory)

    ### record time needed for a single episode (rollout + update)
    episode_time = time.time() - start_time

    # exponential moving average of the (undiscounted) episode return,
    # used to decide when the task counts as solved
    running_return = 0.05 * episode_return + (1 - 0.05) * running_return

    # print stats
    episode += 1
    if episode % 10 == 0:
        template = "episode {}: averaged running return: {:.2f}; took {:0.2f} secs."
        print(template.format(episode, running_return, episode_time))

    ### check if task is considered solved
    if running_return > return_solved:  # condition to consider task solved
        print("\nSolved at episode {} with average running return {}!".format(episode, running_return))
        break

Start training...

episode 10: averaged running return: 16.37; took 0.08 secs.
episode 20: averaged running return: 27.26; took 0.80 secs.
episode 30: averaged running return: 39.75; took 0.70 secs.
episode 40: averaged running return: 48.66; took 0.91 secs.
episode 50: averaged running return: 55.18; took 0.12 secs.
episode 60: averaged running return: 44.94; took 0.66 secs.
episode 70: averaged running return: 33.68; took 0.05 secs.
episode 80: averaged running return: 29.15; took 0.06 secs.
episode 90: averaged running return: 29.65; took 0.07 secs.
episode 100: averaged running return: 30.32; took 0.05 secs.
episode 110: averaged running return: 30.61; took 0.07 secs.
episode 120: averaged running return: 39.97; took 1.04 secs.
episode 130: averaged running return: 41.64; took 0.12 secs.
episode 140: averaged running return: 63.08; took 1.11 secs.
episode 150: averaged running return: 61.51; took 0.93 secs.
episode 160: averaged running return: 69.63; took 0.95 secs.
episode 170: averaged running return: 60.88; took 0.78 secs.
episode 180: averaged running return: 70.87; took 1.19 secs.
episode 190: averaged running return: 87.42; took 0.34 secs.
episode 200: averaged running return: 111.04; took 1.62 secs.
episode 210: averaged running return: 112.74; took 0.88 secs.
episode 220: averaged running return: 87.47; took 0.07 secs.
episode 230: averaged running return: 71.55; took 0.84 secs.
episode 240: averaged running return: 81.89; took 0.90 secs.
episode 250: averaged running return: 140.31; took 1.34 secs.
episode 260: averaged running return: 140.98; took 0.32 secs.
episode 270: averaged running return: 131.98; took 0.17 secs.
episode 280: averaged running return: 116.49; took 0.20 secs.
episode 290: averaged running return: 157.31; took 1.76 secs.
episode 300: averaged running return: 155.98; took 0.25 secs.
episode 310: averaged running return: 235.59; took 1.10 secs.

Solved at episode 316 with average running return 305.6379016611696!

Actor-Critic (AC) Methods

Basic Theory

Actor-Critic Algorithms

Offline Actor-Critic Algorithm

Online Actor-Critic Algorithm

Cart Pole Environment

State (or Observation) and Action spaces for the Cartpole problem

Rewards

Actor-Critic Network

(Pseudo-) Loss Function

Define generalized gradient descent optimizer

Offline Actor-Critic Algorithm

Questions