import matplotlib
from matplotlib import pyplot
import numpy as np
import sys
sys.path.append("..")
from hiora_cartpole import features
from hiora_cartpole import fourier_fa
from hiora_cartpole import driver
import gym
# Create the MountainCar environment and build a Fourier-basis feature
# representation over its observation space.
env = gym.make('MountainCar-v0')
# 2 x n_dims array: row 0 = lower bounds, row 1 = upper bounds of the
# observation space.
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
# Order-7 Fourier basis over the state ranges, replicated for the
# environment's 3 discrete actions.
four_n_weights, four_feature_vec \
= fourier_fa.make_feature_vec(state_ranges,
n_acts=3,
order=7)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
# Initialize the linear function approximation learner over the Fourier
# features.
# - epsi: epsilon for epsilon-greedy action choice (see linfa.choose_action
#   in the traceback below).
# - lmbda: decay parameter (presumably eligibility-trace decay, given the
#   E array in the experience tuple -- confirm in linfa).
# - is_use_alpha_bounds: adapt the step size alpha during training.
experience = linfa.init(lmbda=0.9,
init_alpha=1.0,
epsi=0.01,
feature_vec=four_feature_vec,
n_weights=four_n_weights,
act_space=env.action_space,
theta=None,
is_use_alpha_bounds=True)
/home/erle/.local/lib/python2.7/site-packages/matplotlib/__init__.py:1350: UserWarning: This call to matplotlib.use() has no effect because the backend has already been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot, or matplotlib.backends is imported for the first time. warnings.warn(_use_error_msg) [2016-09-21 14:55:04,801] Making new env: MountainCar-v0
# Train for 2000 episodes of at most 200 steps each, without rendering.
# Returns the updated experience plus per-episode step counts and alphas.
experience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, experience, n_episodes=2000, max_steps=200, is_render=False)
# Credits: http://matplotlib.org/examples/api/two_scales.html
# Two y-scales on one x-axis: steps per episode in blue (left axis),
# learning rate alpha per episode in red (right axis).
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
# Bare expression: the notebook echoes the array of step counts below.
steps_per_episode
array([143, 129, 132, 148, 149, 147, 142, 148, 136, 147, 151, 153, 147, 148, 148, 134, 151, 152, 142, 100, 145, 134, 143, 149, 132, 147, 147, 148, 131, 147, 149, 99, 150, 133, 145, 150, 145, 140, 98, 148, 100, 95, 142, 145, 136, 144, 129, 109, 109, 143, 141, 146, 145, 140, 146, 144, 143, 142, 147, 131, 146, 92, 147, 144, 140, 145, 146, 130, 140, 131, 146, 146, 121, 144, 141, 129, 145, 150, 109, 142, 159, 143, 144, 92, 92, 132, 148, 101, 93, 141, 102, 146, 146, 97, 128, 109, 143, 143, 124, 109, 142, 144, 142, 152, 142, 125, 106, 137, 147, 124, 124, 143, 149, 116, 122, 110, 144, 149, 148, 141, 147, 127, 151, 102, 130, 93, 111, 119, 147, 99, 129, 141, 97, 113, 150, 110, 140, 150, 140, 153, 99, 130, 148, 147, 144, 134, 127, 140, 124, 146, 148, 197, 150, 137, 117, 132, 93, 130, 98, 147, 148, 148, 113, 130, 112, 167, 110, 147, 93, 117, 148, 132, 94, 145, 146, 138, 146, 123, 146, 149, 113, 147, 147, 132, 165, 106, 133, 149, 148, 147, 104, 146, 137, 145, 132, 150, 149, 127, 119, 112, 146, 149, 144, 141, 145, 99, 128, 146, 147, 133, 145, 143, 130, 173, 94, 129, 148, 144, 132, 151, 94, 149, 125, 141, 129, 131, 130, 200, 200, 200, 154, 154, 197, 151, 134, 147, 126, 115, 171, 135, 143, 147, 168, 140, 150, 141, 162, 140, 148, 170, 175, 132, 141, 124, 114, 131, 125, 129, 148, 144, 148, 135, 191, 135, 133, 147, 166, 148, 131, 149, 149, 136, 136, 147, 150, 150, 171, 200, 143, 128, 113, 119, 131, 146, 147, 147, 128, 130, 146, 177, 138, 144, 151, 131, 133, 131, 132, 144, 109, 141, 132, 129, 132, 200, 200, 148, 147, 150, 144, 131, 147, 141, 133, 174, 176, 171, 170, 138, 135, 143, 163, 164, 173, 170, 166, 138, 132, 137, 143, 123, 133, 130, 133, 173, 141, 200, 200, 156, 150, 153, 180, 200, 146, 140, 187, 129, 136, 130, 143, 138, 140, 116, 136, 143, 147, 141, 138, 142, 147, 144, 165, 164, 139, 102, 137, 173, 121, 137, 135, 146, 171, 191, 131, 144, 168, 102, 129, 145, 114, 125, 200, 150, 148, 154, 133, 167, 149, 169, 150, 163, 134, 182, 116, 114, 117, 140, 132, 112, 148, 111, 143, 117, 
103, 135, 150, 132, 149, 153, 148, 150, 134, 114, 142, 121, 156, 149, 147, 101, 99, 198, 155, 136, 170, 138, 137, 181, 140, 171, 143, 162, 138, 144, 136, 142, 141, 135, 134, 134, 143, 166, 148, 139, 144, 146, 190, 138, 147, 131, 137, 112, 142, 191, 140, 114, 117, 112, 111, 148, 181, 147, 174, 148, 174, 174, 143, 151, 111, 100, 141, 113, 116, 112, 113, 121, 154, 149, 180, 149, 143, 174, 133, 143, 152, 145, 144, 192, 182, 200, 148, 147, 146, 147, 179, 148, 149, 143, 144, 139, 186, 181], dtype=int32)
%matplotlib notebook
# Plot the learned value function over a 2-D grid of states.
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
driver.plot_2D_V(state_ranges, env.action_space, four_feature_vec, experience.theta)
(40, 40) (40, 40) (40, 40)
# Watch the trained agent for 10 rendered episodes.
# Fixes: removed the stray "tg" trailing the call (a typo that made this
# line a SyntaxError) and unpack all three return values of driver.train
# (experience, step counts, alphas) as every other train call in this
# notebook does; a two-target unpack would raise ValueError.
experience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, experience, n_episodes=10, max_steps=200, is_render=True)
#pyplot.plot(steps_per_episode)
#pyplot.show()
from hiora_cartpole import features
# Same experiment with Gehring's tile-coding features instead of the
# Fourier basis: 3 actions, [9, 9] grid with 5 tilings (presumably --
# confirm against features.make_feature_vec).
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
tilec_n_weights, tilec_feature_vec = features.make_feature_vec(state_ranges, 3, [9, 9], 5)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
# Learner over the tile-coding features; same hyperparameters as the
# Fourier run above.
fexperience = linfa.init(lmbda=0.9,
init_alpha=1.0,
epsi=0.01,
feature_vec=tilec_feature_vec,
n_weights=tilec_n_weights,
act_space=env.action_space,
theta=None,
is_use_alpha_bounds=True)
[2016-09-20 10:13:46,516] Making new env: MountainCar-v0
# Train the tile-coding learner for 100 rendered episodes.
# (The saved traceback below shows this run was interrupted with Ctrl-C.)
fexperience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=True)
# Credits: http://matplotlib.org/examples/api/two_scales.html
# Same two-scale plot as above: steps (blue, left) vs. alpha (red, right).
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
---------------------------- KeyboardInterruptTraceback (most recent call last) <ipython-input-7-9edff38cd70a> in <module>() ----> 1 fexperience, steps_per_episode, alpha_per_episode = driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=True) 2 # Credits: http://matplotlib.org/examples/api/two_scales.html 3 fig, ax1 = pyplot.subplots() 4 ax1.plot(steps_per_episode, color='b') 5 ax2 = ax1.twinx() /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in train(env, learner, experience, n_episodes, max_steps, is_render) 53 54 for t in xrange(max_steps): ---> 55 is_render and env.render() # pylint: disable=expression-not-assigned 56 experience, action = learner.think(experience, observation, reward, 57 done) /home/erle/repos/gym/gym/core.pyc in render(self, mode, close) 187 raise error.UnsupportedMode('Unsupported rendering mode: {}. (Supported modes for {}: {})'.format(mode, self, modes)) 188 --> 189 return self._render(mode=mode, close=close) 190 191 def close(self): /home/erle/repos/gym/gym/envs/classic_control/mountain_car.pyc in _render(self, mode, close) 117 self.cartrans.set_rotation(math.cos(3 * pos)) 118 --> 119 return self.viewer.render(return_rgb_array = mode=='rgb_array') /home/erle/repos/gym/gym/envs/classic_control/rendering.pyc in render(self, return_rgb_array) 80 def render(self, return_rgb_array=False): 81 glClearColor(1,1,1,1) ---> 82 self.window.clear() 83 self.window.switch_to() 84 self.window.dispatch_events() /home/erle/.local/lib/python2.7/site-packages/pyglet/window/__init__.pyc in clear(self) 1149 buffer. The window must be the active context (see `switch_to`). 1150 ''' -> 1151 gl.glClear(gl.GL_COLOR_BUFFER_BIT | gl.GL_DEPTH_BUFFER_BIT) 1152 1153 def dispatch_event(self, *args): /home/erle/.local/lib/python2.7/site-packages/pyglet/gl/lib.pyc in errcheck(result, func, arguments) 82 pass 83 ---> 84 def errcheck(result, func, arguments): 85 if _debug_gl_trace: 86 try: KeyboardInterrupt:
%matplotlib notebook
# Value-function plot for the tile-coding learner.
driver.plot_2D_V(state_ranges, env.action_space, tilec_feature_vec, fexperience.theta)
(40, 40) (40, 40) (40, 40)
Gehring's tile coding is incredibly slow!
%time driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=False)
CPU times: user 1min 34s, sys: 76 ms, total: 1min 34s Wall time: 1min 34s
(Immutable(feature_vec=<function feature_vec_inner at 0x7fbe17f9aa28>, theta=array([ 0., 0., 0., ..., 0., 0., 0.]), E=array([ 0., 0., 0., ..., 0., 0., 0.]), epsi=0.01, init_alpha=1.0, p_alpha=0.024050127792627128, lmbda=0.9, p_obs=None, p_act=None, p_feat=None, act_space=Discrete(3), is_use_alpha_bounds=True), array([200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 164, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 180, 162, 152, 185, 177, 149, 154, 158, 200, 162, 159, 164, 157, 151, 155, 162, 160, 156, 158, 156, 154, 158, 154, 160, 150, 149, 153, 153, 153, 154, 154, 158, 158, 153], dtype=int32), array([ 0.04718258, 0.04718258, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 
0.02405013, 0.02405013, 0.02405013, 0.02405013]))
from hiora_cartpole import easytile_fa
# Third variant: the easytile tile-coding implementation, same
# [9, 9] x 5 layout as Gehring's above.
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
easyt_n_weights, easyt_feature_vec = easytile_fa.make_feature_vec(state_ranges, 3, [9, 9], 5)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
# Note: unlike the previous runs, this uses a small fixed step size
# (init_alpha=0.005) and disables the adaptive alpha bounds.
eexperience = linfa.init(lmbda=0.9,
init_alpha=0.005,
epsi=0.01,
feature_vec=easyt_feature_vec,
n_weights=easyt_n_weights,
act_space=env.action_space,
theta=None,
is_use_alpha_bounds=False)
[2016-09-21 15:15:46,355] Making new env: MountainCar-v0
The learner is quite sensitive to the lambda parameter.
# Train the easytile learner for 5000 episodes without rendering.
eexperience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, eexperience, n_episodes=5000, max_steps=200, is_render=False)
# Credits: http://matplotlib.org/examples/api/two_scales.html
# Steps per episode (blue, left axis) vs. alpha per episode (red, right).
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
# Plot the raw weight vector itself to inspect its structure.
fig, ax1 = pyplot.subplots()
ax1.plot(eexperience.theta)
pyplot.show()
%matplotlib notebook
# NOTE(review): plots the NEGATED weights (-eexperience.theta), unlike the
# earlier plot_2D_V calls -- presumably to flip value into cost-to-go for
# display; confirm against plot_2D_V.
driver.plot_2D_V(state_ranges, env.action_space, easyt_feature_vec, -eexperience.theta)
(40, 40) (40, 40) (40, 40)
# Watch the easytile learner for 20 rendered episodes.
eexperience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, eexperience, n_episodes=20, max_steps=200, is_render=True)
%time driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=False)
CPU times: user 4.4 s, sys: 4 ms, total: 4.4 s Wall time: 4.4 s
(Immutable(feature_vec=<function feature_vec_inner at 0x7fbe1057aed8>, theta=array([ 0. , -1.6374056, -7.3466172, ..., 0. , 0. , 0. ]), E=array([ 0., 0., 0., ..., 0., 0., 0.]), epsi=0.01, init_alpha=1.0, p_alpha=0.024309124525753373, lmbda=0.9, p_obs=None, p_act=None, p_feat=None, act_space=Discrete(3), is_use_alpha_bounds=True), array([200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 161, 200, 200, 200, 151, 174, 200, 169, 171, 200, 138, 188, 141, 88, 88, 135, 140, 141, 92, 158, 150, 88, 90, 86, 139, 144, 161, 122, 161, 152, 136, 135, 148, 139, 100, 141, 140, 173, 200, 154, 152, 145, 138, 139, 135, 132, 134, 134, 134, 136, 138, 134, 134, 119, 116, 110, 109, 187, 135, 117, 120, 113, 116, 89, 136, 103, 104, 106, 109, 128, 138, 86, 137, 144, 140, 136, 88, 130, 84, 83, 168, 129], dtype=int32), array([ 0.04487296, 0.03067321, 0.03067321, 0.03067321, 0.03067321, 0.02911278, 0.02911278, 0.02702132, 0.02702132, 0.02702132, 0.02702132, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 
0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912]))
Much faster: 4.4 s instead of 1 min 34 s for the same 100 episodes.
# What do I want?
#
# - Write one procedure that just trains for a number of episodes.
#
# - Write another procedure that keeps a running average of episode lengths and
# stops training when the average doesn't change much anymore.
#
# - Possibly write a procedure that returns the sequence of Q functions
# resulting from training.
# Compose the stepwise training pipeline from driver's factory functions:
# next_dtimestep advances one timestep, train_and_prep runs one episode.
next_dtimestep = driver.make_next_dtimestep(env, linfa.think)
train_and_prep = driver.make_train_and_prep(env, next_dtimestep, linfa.wrapup)
# Train until the running average of episode lengths over a 200-episode
# window changes by no more than max_diff=1, capped at 10000 episodes.
episode_nr, last_avg, experience \
= driver.train_until_converged(
env=env,
train_and_prep=train_and_prep,
init_experience=experience,
max_steps=100,
max_episodes=10000,
avg_window=200,
max_diff=1)
# Python 2 print statement (this notebook ran under Python 2.7).
print episode_nr, last_avg
# Iterator of training results; collect 1000 successive theta snapshots.
cnts_dtimesteps = driver.cnts_dtimesteps_iter(env, train_and_prep, experience,
100)
thetas = driver.train_return_thetas(cnts_dtimesteps, 1000)
# Mean squared difference between consecutive theta snapshots -- a
# convergence indicator (small values mean the weights stopped moving).
sqes = 1.0 / experience.theta.shape[0] * np.sum(np.diff(thetas, axis=0) ** 2, axis=1)
pyplot.plot(sqes)
pyplot.show()
# L1 norm of each theta snapshot -- shows overall weight growth over time.
sums = np.sum(np.abs(thetas), axis=1)
pyplot.plot(sums)
pyplot.show()
# Load a previously saved weight vector for comparison; np.savez stores
# unnamed arrays under the key 'arr_0'.
with np.load("hard-earned-theta.npz") as data:
old_theta = data['arr_0']
print np.sum(old_theta)
# The code that originally saved it, kept for reference:
#hard_earned_theta = np.copy(experience.theta)
#np.savez_compressed("hard-earned-theta", hard_earned_theta)
---------------------------- KeyboardInterruptTraceback (most recent call last) <ipython-input-9-d07e132dca21> in <module>() 19 max_episodes=10000, 20 avg_window=200, ---> 21 max_diff=1) 22 print episode_nr, last_avg 23 /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in train_until_converged(env, train_and_prep, init_experience, max_steps, max_episodes, avg_window, max_diff) 138 imap(lambda (a1, a2): abs(a1 - a2), 139 pairwise( --> 140 averages(limited_cnts, avg_window) 141 ) 142 ) /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in iterate(f, x) 17 while True: 18 yield x ---> 19 x = f(x) 20 21 #### Some iterators /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in <lambda>((_, dt)) 123 124 train_and_prep_ms = functools.partial(train_and_prep, max_steps=max_steps) --> 125 cnts_dtimesteps = iterate(lambda (_, dt): train_and_prep_ms(dt), 126 (0, first_dtimestep)) 127 /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in train_and_prep_inner(first_dtimestep, max_steps) 92 n_dtimestep = 0 93 for n_dtimestep in enumerator: ---> 94 dtimestep = next_dtimestep(dtimestep) 95 96 if dtimestep.done: /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in next_dtimestep_inner(dtimestep) 77 new_experience, new_action = think(dtimestep.experience, 78 dtimestep.observation, ---> 79 dtimestep.reward) 80 new_observation, new_reward, new_done, _ = env.step(new_action) 81 /home/erle/repos/cartpole/hiora_cartpole/linfa.pyc in think(e, o, r, done) 62 63 if not done: ---> 64 a = choose_action(e, o) # action 65 feat = e.feature_vec(o, a) 66 Qnext = feat.dot(e.theta) /home/erle/repos/cartpole/hiora_cartpole/linfa.pyc in choose_action(e, o) 45 46 def choose_action(e, o): ---> 47 if true_with_prob(e.epsi): 48 return e.act_space.sample() 49 else: /home/erle/repos/cartpole/hiora_cartpole/linfa.pyc in true_with_prob(p) 41 42 def true_with_prob(p): ---> 43 return np.random.choice(2, p=[1-p, p]) 44 45 KeyboardInterrupt:
np.array([[1, 2], [3, 4]])[0]
array([1, 2])
env.close()