import matplotlib
from matplotlib import pyplot
import numpy as np
import sys
sys.path.append("..")
from hiora_cartpole import features
from hiora_cartpole import fourier_fa
from hiora_cartpole import driver
import gym
# Create the MountainCar environment and build a Fourier-basis feature
# representation over its observation space.
env = gym.make('MountainCar-v0')
# 2 x n_dims array: row 0 = lower bounds, row 1 = upper bounds of the
# observation space.
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
# Order-7 Fourier basis over the state ranges, replicated for the
# environment's 3 discrete actions.
four_n_weights, four_feature_vec \
= fourier_fa.make_feature_vec(state_ranges,
n_acts=3,
order=7)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
# Initialize the linear function approximation learner over the Fourier
# features.
# - epsi: epsilon for epsilon-greedy action choice (see linfa.choose_action
#   in the traceback below).
# - lmbda: decay parameter (presumably eligibility-trace decay, given the
#   E array in the experience tuple -- confirm in linfa).
# - is_use_alpha_bounds: adapt the step size alpha during training.
experience = linfa.init(lmbda=0.9,
init_alpha=1.0,
epsi=0.01,
feature_vec=four_feature_vec,
n_weights=four_n_weights,
act_space=env.action_space,
theta=None,
is_use_alpha_bounds=True)
/home/erle/.local/lib/python2.7/site-packages/matplotlib/__init__.py:1350: UserWarning: This call to matplotlib.use() has no effect because the backend has already been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot, or matplotlib.backends is imported for the first time. warnings.warn(_use_error_msg) [2016-09-21 14:55:04,801] Making new env: MountainCar-v0
# Train for 2000 episodes of at most 200 steps each, without rendering.
# Returns the updated experience plus per-episode step counts and alphas.
experience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, experience, n_episodes=2000, max_steps=200, is_render=False)
# Credits: http://matplotlib.org/examples/api/two_scales.html
# Two y-scales on one x-axis: steps per episode in blue (left axis),
# learning rate alpha per episode in red (right axis).
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
# Bare expression: the notebook echoes the array of step counts below.
steps_per_episode
array([143, 129, 132, 148, 149, 147, 142, 148, 136, 147, 151, 153, 147, 148, 148, 134, 151, 152, 142, 100, 145, 134, 143, 149, 132, 147, 147, 148, 131, 147, 149, 99, 150, 133, 145, 150, 145, 140, 98, 148, 100, 95, 142, 145, 136, 144, 129, 109, 109, 143, 141, 146, 145, 140, 146, 144, 143, 142, 147, 131, 146, 92, 147, 144, 140, 145, 146, 130, 140, 131, 146, 146, 121, 144, 141, 129, 145, 150, 109, 142, 159, 143, 144, 92, 92, 132, 148, 101, 93, 141, 102, 146, 146, 97, 128, 109, 143, 143, 124, 109, 142, 144, 142, 152, 142, 125, 106, 137, 147, 124, 124, 143, 149, 116, 122, 110, 144, 149, 148, 141, 147, 127, 151, 102, 130, 93, 111, 119, 147, 99, 129, 141, 97, 113, 150, 110, 140, 150, 140, 153, 99, 130, 148, 147, 144, 134, 127, 140, 124, 146, 148, 197, 150, 137, 117, 132, 93, 130, 98, 147, 148, 148, 113, 130, 112, 167, 110, 147, 93, 117, 148, 132, 94, 145, 146, 138, 146, 123, 146, 149, 113, 147, 147, 132, 165, 106, 133, 149, 148, 147, 104, 146, 137, 145, 132, 150, 149, 127, 119, 112, 146, 149, 144, 141, 145, 99, 128, 146, 147, 133, 145, 143, 130, 173, 94, 129, 148, 144, 132, 151, 94, 149, 125, 141, 129, 131, 130, 200, 200, 200, 154, 154, 197, 151, 134, 147, 126, 115, 171, 135, 143, 147, 168, 140, 150, 141, 162, 140, 148, 170, 175, 132, 141, 124, 114, 131, 125, 129, 148, 144, 148, 135, 191, 135, 133, 147, 166, 148, 131, 149, 149, 136, 136, 147, 150, 150, 171, 200, 143, 128, 113, 119, 131, 146, 147, 147, 128, 130, 146, 177, 138, 144, 151, 131, 133, 131, 132, 144, 109, 141, 132, 129, 132, 200, 200, 148, 147, 150, 144, 131, 147, 141, 133, 174, 176, 171, 170, 138, 135, 143, 163, 164, 173, 170, 166, 138, 132, 137, 143, 123, 133, 130, 133, 173, 141, 200, 200, 156, 150, 153, 180, 200, 146, 140, 187, 129, 136, 130, 143, 138, 140, 116, 136, 143, 147, 141, 138, 142, 147, 144, 165, 164, 139, 102, 137, 173, 121, 137, 135, 146, 171, 191, 131, 144, 168, 102, 129, 145, 114, 125, 200, 150, 148, 154, 133, 167, 149, 169, 150, 163, 134, 182, 116, 114, 117, 140, 132, 112, 148, 111, 143, 117, 
103, 135, 150, 132, 149, 153, 148, 150, 134, 114, 142, 121, 156, 149, 147, 101, 99, 198, 155, 136, 170, 138, 137, 181, 140, 171, 143, 162, 138, 144, 136, 142, 141, 135, 134, 134, 143, 166, 148, 139, 144, 146, 190, 138, 147, 131, 137, 112, 142, 191, 140, 114, 117, 112, 111, 148, 181, 147, 174, 148, 174, 174, 143, 151, 111, 100, 141, 113, 116, 112, 113, 121, 154, 149, 180, 149, 143, 174, 133, 143, 152, 145, 144, 192, 182, 200, 148, 147, 146, 147, 179, 148, 149, 143, 144, 139, 186, 181], dtype=int32)
%matplotlib notebook
# Plot the learned value function over a 2-D grid of states.
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
driver.plot_2D_V(state_ranges, env.action_space, four_feature_vec, experience.theta)
(40, 40) (40, 40) (40, 40)
# Watch the trained agent for 10 rendered episodes.
# Fixes: removed the stray "tg" trailing the call (a typo that made this
# line a SyntaxError) and unpack all three return values of driver.train
# (experience, step counts, alphas) as every other train call in this
# notebook does; a two-target unpack would raise ValueError.
experience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, experience, n_episodes=10, max_steps=200, is_render=True)
#pyplot.plot(steps_per_episode)
#pyplot.show()
from hiora_cartpole import features
# Same experiment with Gehring's tile-coding features instead of the
# Fourier basis: 3 actions, [9, 9] grid with 5 tilings (presumably --
# confirm against features.make_feature_vec).
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
tilec_n_weights, tilec_feature_vec = features.make_feature_vec(state_ranges, 3, [9, 9], 5)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
# Learner over the tile-coding features; same hyperparameters as the
# Fourier run above.
fexperience = linfa.init(lmbda=0.9,
init_alpha=1.0,
epsi=0.01,
feature_vec=tilec_feature_vec,
n_weights=tilec_n_weights,
act_space=env.action_space,
theta=None,
is_use_alpha_bounds=True)
[2016-09-20 10:13:46,516] Making new env: MountainCar-v0
# Train the tile-coding learner for 100 rendered episodes.
# (The saved traceback below shows this run was interrupted with Ctrl-C.)
fexperience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=True)
# Credits: http://matplotlib.org/examples/api/two_scales.html
# Same two-scale plot as above: steps (blue, left) vs. alpha (red, right).
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
---------------------------- KeyboardInterruptTraceback (most recent call last) <ipython-input-7-9edff38cd70a> in <module>() ----> 1 fexperience, steps_per_episode, alpha_per_episode = driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=True) 2 # Credits: http://matplotlib.org/examples/api/two_scales.html 3 fig, ax1 = pyplot.subplots() 4 ax1.plot(steps_per_episode, color='b') 5 ax2 = ax1.twinx() /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in train(env, learner, experience, n_episodes, max_steps, is_render) 53 54 for t in xrange(max_steps): ---> 55 is_render and env.render() # pylint: disable=expression-not-assigned 56 experience, action = learner.think(experience, observation, reward, 57 done) /home/erle/repos/gym/gym/core.pyc in render(self, mode, close) 187 raise error.UnsupportedMode('Unsupported rendering mode: {}. (Supported modes for {}: {})'.format(mode, self, modes)) 188 --> 189 return self._render(mode=mode, close=close) 190 191 def close(self): /home/erle/repos/gym/gym/envs/classic_control/mountain_car.pyc in _render(self, mode, close) 117 self.cartrans.set_rotation(math.cos(3 * pos)) 118 --> 119 return self.viewer.render(return_rgb_array = mode=='rgb_array') /home/erle/repos/gym/gym/envs/classic_control/rendering.pyc in render(self, return_rgb_array) 80 def render(self, return_rgb_array=False): 81 glClearColor(1,1,1,1) ---> 82 self.window.clear() 83 self.window.switch_to() 84 self.window.dispatch_events() /home/erle/.local/lib/python2.7/site-packages/pyglet/window/__init__.pyc in clear(self) 1149 buffer. The window must be the active context (see `switch_to`). 1150 ''' -> 1151 gl.glClear(gl.GL_COLOR_BUFFER_BIT | gl.GL_DEPTH_BUFFER_BIT) 1152 1153 def dispatch_event(self, *args): /home/erle/.local/lib/python2.7/site-packages/pyglet/gl/lib.pyc in errcheck(result, func, arguments) 82 pass 83 ---> 84 def errcheck(result, func, arguments): 85 if _debug_gl_trace: 86 try: KeyboardInterrupt:
%matplotlib notebook
# Value-function plot for the tile-coding learner.
driver.plot_2D_V(state_ranges, env.action_space, tilec_feature_vec, fexperience.theta)
(40, 40) (40, 40) (40, 40)
Gehring's tile coding is incredibly slow!
%time driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=False)
CPU times: user 1min 34s, sys: 76 ms, total: 1min 34s Wall time: 1min 34s
(Immutable(feature_vec=<function feature_vec_inner at 0x7fbe17f9aa28>, theta=array([ 0., 0., 0., ..., 0., 0., 0.]), E=array([ 0., 0., 0., ..., 0., 0., 0.]), epsi=0.01, init_alpha=1.0, p_alpha=0.024050127792627128, lmbda=0.9, p_obs=None, p_act=None, p_feat=None, act_space=Discrete(3), is_use_alpha_bounds=True), array([200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 164, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 180, 162, 152, 185, 177, 149, 154, 158, 200, 162, 159, 164, 157, 151, 155, 162, 160, 156, 158, 156, 154, 158, 154, 160, 150, 149, 153, 153, 153, 154, 154, 158, 158, 153], dtype=int32), array([ 0.04718258, 0.04718258, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.03039294, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 0.02405013, 
0.02405013, 0.02405013, 0.02405013, 0.02405013]))
from hiora_cartpole import easytile_fa
# Third variant: the easytile tile-coding implementation, same
# [9, 9] x 5 layout as Gehring's above.
env = gym.make('MountainCar-v0')
state_ranges = np.array([env.observation_space.low, env.observation_space.high])
easyt_n_weights, easyt_feature_vec = easytile_fa.make_feature_vec(state_ranges, 3, [9, 9], 5)
#fv = feature_vec(cartpole.observation_space.sample(), cartpole.action_space.sample())
from hiora_cartpole import linfa
# Note: unlike the previous runs, this uses a small fixed step size
# (init_alpha=0.005) and disables the adaptive alpha bounds.
eexperience = linfa.init(lmbda=0.9,
init_alpha=0.005,
epsi=0.01,
feature_vec=easyt_feature_vec,
n_weights=easyt_n_weights,
act_space=env.action_space,
theta=None,
is_use_alpha_bounds=False)
[2016-09-21 15:15:46,355] Making new env: MountainCar-v0
The learner is quite sensitive to the lambda parameter.
# Train the easytile learner for 5000 episodes without rendering.
eexperience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, eexperience, n_episodes=5000, max_steps=200, is_render=False)
# Credits: http://matplotlib.org/examples/api/two_scales.html
# Steps per episode (blue, left axis) vs. alpha per episode (red, right).
fig, ax1 = pyplot.subplots()
ax1.plot(steps_per_episode, color='b')
ax2 = ax1.twinx()
ax2.plot(alpha_per_episode, color='r')
pyplot.show()
# Plot the raw weight vector itself to inspect its structure.
fig, ax1 = pyplot.subplots()
ax1.plot(eexperience.theta)
pyplot.show()
%matplotlib notebook
# NOTE(review): plots the NEGATED weights (-eexperience.theta), unlike the
# earlier plot_2D_V calls -- presumably to flip value into cost-to-go for
# display; confirm against plot_2D_V.
driver.plot_2D_V(state_ranges, env.action_space, easyt_feature_vec, -eexperience.theta)
(40, 40) (40, 40) (40, 40)
# Watch the easytile learner for 20 rendered episodes.
eexperience, steps_per_episode, alpha_per_episode \
= driver.train(env, linfa, eexperience, n_episodes=20, max_steps=200, is_render=True)
%time driver.train(env, linfa, fexperience, n_episodes=100, max_steps=200, is_render=False)
CPU times: user 4.4 s, sys: 4 ms, total: 4.4 s Wall time: 4.4 s
(Immutable(feature_vec=<function feature_vec_inner at 0x7fbe1057aed8>, theta=array([ 0. , -1.6374056, -7.3466172, ..., 0. , 0. , 0. ]), E=array([ 0., 0., 0., ..., 0., 0., 0.]), epsi=0.01, init_alpha=1.0, p_alpha=0.024309124525753373, lmbda=0.9, p_obs=None, p_act=None, p_feat=None, act_space=Discrete(3), is_use_alpha_bounds=True), array([200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 161, 200, 200, 200, 151, 174, 200, 169, 171, 200, 138, 188, 141, 88, 88, 135, 140, 141, 92, 158, 150, 88, 90, 86, 139, 144, 161, 122, 161, 152, 136, 135, 148, 139, 100, 141, 140, 173, 200, 154, 152, 145, 138, 139, 135, 132, 134, 134, 134, 136, 138, 134, 134, 119, 116, 110, 109, 187, 135, 117, 120, 113, 116, 89, 136, 103, 104, 106, 109, 128, 138, 86, 137, 144, 140, 136, 88, 130, 84, 83, 168, 129], dtype=int32), array([ 0.04487296, 0.03067321, 0.03067321, 0.03067321, 0.03067321, 0.02911278, 0.02911278, 0.02702132, 0.02702132, 0.02702132, 0.02702132, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912, 
0.02430912, 0.02430912, 0.02430912, 0.02430912, 0.02430912]))
Much faster: 4.4 s instead of 1 min 34 s for the same 100 episodes.
# What do I want?
#
# - Write one procedure that just trains for a number of episodes.
#
# - Write another procedure that keeps a running average of episode lengths and
# stops training when the average doesn't change much anymore.
#
# - Possibly write a procedure that returns the sequence of Q functions
# resulting from training.
# Compose the stepwise training pipeline from driver's factory functions:
# next_dtimestep advances one timestep, train_and_prep runs one episode.
next_dtimestep = driver.make_next_dtimestep(env, linfa.think)
train_and_prep = driver.make_train_and_prep(env, next_dtimestep, linfa.wrapup)
# Train until the running average of episode lengths over a 200-episode
# window changes by no more than max_diff=1, capped at 10000 episodes.
episode_nr, last_avg, experience \
= driver.train_until_converged(
env=env,
train_and_prep=train_and_prep,
init_experience=experience,
max_steps=100,
max_episodes=10000,
avg_window=200,
max_diff=1)
# Python 2 print statement (this notebook ran under Python 2.7).
print episode_nr, last_avg
# Iterator of training results; collect 1000 successive theta snapshots.
cnts_dtimesteps = driver.cnts_dtimesteps_iter(env, train_and_prep, experience,
100)
thetas = driver.train_return_thetas(cnts_dtimesteps, 1000)
# Mean squared difference between consecutive theta snapshots -- a
# convergence indicator (small values mean the weights stopped moving).
sqes = 1.0 / experience.theta.shape[0] * np.sum(np.diff(thetas, axis=0) ** 2, axis=1)
pyplot.plot(sqes)
pyplot.show()
# L1 norm of each theta snapshot -- shows overall weight growth over time.
sums = np.sum(np.abs(thetas), axis=1)
pyplot.plot(sums)
pyplot.show()
# Load a previously saved weight vector for comparison; np.savez stores
# unnamed arrays under the key 'arr_0'.
with np.load("hard-earned-theta.npz") as data:
old_theta = data['arr_0']
print np.sum(old_theta)
# The code that originally saved it, kept for reference:
#hard_earned_theta = np.copy(experience.theta)
#np.savez_compressed("hard-earned-theta", hard_earned_theta)
---------------------------- KeyboardInterruptTraceback (most recent call last) <ipython-input-9-d07e132dca21> in <module>() 19 max_episodes=10000, 20 avg_window=200, ---> 21 max_diff=1) 22 print episode_nr, last_avg 23 /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in train_until_converged(env, train_and_prep, init_experience, max_steps, max_episodes, avg_window, max_diff) 138 imap(lambda (a1, a2): abs(a1 - a2), 139 pairwise( --> 140 averages(limited_cnts, avg_window) 141 ) 142 ) /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in iterate(f, x) 17 while True: 18 yield x ---> 19 x = f(x) 20 21 #### Some iterators /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in <lambda>((_, dt)) 123 124 train_and_prep_ms = functools.partial(train_and_prep, max_steps=max_steps) --> 125 cnts_dtimesteps = iterate(lambda (_, dt): train_and_prep_ms(dt), 126 (0, first_dtimestep)) 127 /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in train_and_prep_inner(first_dtimestep, max_steps) 92 n_dtimestep = 0 93 for n_dtimestep in enumerator: ---> 94 dtimestep = next_dtimestep(dtimestep) 95 96 if dtimestep.done: /home/erle/repos/cartpole/hiora_cartpole/driver.pyc in next_dtimestep_inner(dtimestep) 77 new_experience, new_action = think(dtimestep.experience, 78 dtimestep.observation, ---> 79 dtimestep.reward) 80 new_observation, new_reward, new_done, _ = env.step(new_action) 81 /home/erle/repos/cartpole/hiora_cartpole/linfa.pyc in think(e, o, r, done) 62 63 if not done: ---> 64 a = choose_action(e, o) # action 65 feat = e.feature_vec(o, a) 66 Qnext = feat.dot(e.theta) /home/erle/repos/cartpole/hiora_cartpole/linfa.pyc in choose_action(e, o) 45 46 def choose_action(e, o): ---> 47 if true_with_prob(e.epsi): 48 return e.act_space.sample() 49 else: /home/erle/repos/cartpole/hiora_cartpole/linfa.pyc in true_with_prob(p) 41 42 def true_with_prob(p): ---> 43 return np.random.choice(2, p=[1-p, p]) 44 45 KeyboardInterrupt:
np.array([[1, 2], [3, 4]])[0]
array([1, 2])
env.close()