import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
ENV_NAME_2 = 'Asteroids-v0'
# Get the environment and extract the number of actions
env = gym.make(ENV_NAME_2)
nb_actions = env.action_space.n
nb_actions
14
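# For reference, the observation is the raw RGB Atari screen; the Flatten
# layer below unrolls it into a single vector (210 * 160 * 3 = 100800 inputs):
print(env.observation_space.shape)                # expected: (210, 160, 3)
print(int(np.prod(env.observation_space.shape)))  # expected: 100800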
# Next, we build a neural network model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(3, activation='tanh'))  # one hidden layer of 3 units with tanh activation
model.add(Dense(nb_actions))
model.add(Activation('sigmoid'))  # output layer: one unit per action, with sigmoid activation
print(model.summary())
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
flatten_1 (Flatten)          (None, 100800)            0
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 302403
_________________________________________________________________
dense_2 (Dense)              (None, 14)                56
_________________________________________________________________
activation_1 (Activation)    (None, 14)                0
=================================================================
Total params: 302,459
Trainable params: 302,459
Non-trainable params: 0
_________________________________________________________________
None
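# Sanity check on the parameter counts above: a Dense layer has
# inputs * units + units (bias) parameters.
assert 100800 * 3 + 3 == 302403  # dense_1
assert 3 * 14 + 14 == 56         # dense_2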
# DQN -- Deep Reinforcement Learning
# Configure and compile the agent.
# Any built-in Keras optimizer and metrics can be used.
memory = SequentialMemory(limit=20000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])
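# Since target_model_update < 1, keras-rl performs a "soft" update of the
# target network after each training step (values >= 1 would instead copy the
# weights every int(target_model_update) steps). An illustrative sketch of
# the soft-update rule -- not the library's actual code:
def soft_update(online_weights, target_weights, tau=1e-2):
    # target <- tau * online + (1 - tau) * target, applied per weight array
    return [tau * w + (1.0 - tau) * tw
            for w, tw in zip(online_weights, target_weights)]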
## Visualize the training during 100000 steps
dqn.fit(env, nb_steps=100000, visualize=True, verbose=2)
Training for 100000 steps ...
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/rl/memory.py:29: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!
1455/100000: episode: 1, duration: 46.233s, episode steps: 1455, steps per second: 31, episode reward: 1350.000, mean reward: 0.928 [0.000, 100.000], mean action: 6.509 [0.000, 13.000], mean observation: 2.139 [0.000, 240.000], loss: 21.742520, mean_absolute_error: 0.748284, acc: 0.079748, mean_q: 0.865032
2423/100000: episode: 2, duration: 30.502s, episode steps: 968, steps per second: 32, episode reward: 630.000, mean reward: 0.651 [0.000, 100.000], mean action: 6.411 [0.000, 13.000], mean observation: 2.337 [0.000, 240.000], loss: 36.164433, mean_absolute_error: 0.916387, acc: 0.075252, mean_q: 0.964510
3192/100000: episode: 3, duration: 23.303s, episode steps: 769, steps per second: 33, episode reward: 780.000, mean reward: 1.014 [0.000, 100.000], mean action: 6.113 [0.000, 13.000], mean observation: 1.996 [0.000, 240.000], loss: 32.863270, mean_absolute_error: 0.941716, acc: 0.073716, mean_q: 0.978775
4037/100000: episode: 4, duration: 25.042s, episode steps: 845, steps per second: 34, episode reward: 880.000, mean reward: 1.041 [0.000, 100.000], mean action: 6.650 [0.000, 13.000], mean observation: 2.124 [0.000, 240.000], loss: 33.935211, mean_absolute_error: 0.961146, acc: 0.068861, mean_q: 0.984799
4472/100000: episode: 5, duration: 12.989s, episode steps: 435, steps per second: 33, episode reward: 330.000, mean reward: 0.759 [0.000, 50.000], mean action: 6.377 [0.000, 13.000], mean observation: 2.465 [0.000, 240.000], loss: 31.147161, mean_absolute_error: 0.964446, acc: 0.073851, mean_q: 0.988034
6292/100000: episode: 6, duration: 56.351s, episode steps: 1820, steps per second: 32, episode reward: 1180.000, mean reward: 0.648 [0.000, 100.000], mean action: 6.511 [0.000, 13.000], mean observation: 1.940 [0.000, 240.000], loss: 31.117493, mean_absolute_error: 0.973881, acc: 0.072373, mean_q: 0.991440
7098/100000: episode: 7, duration: 24.283s, episode steps: 806, steps per second: 33, episode reward: 780.000, mean reward: 0.968 [0.000, 100.000], mean action: 6.561 [0.000, 13.000], mean observation: 2.627 [0.000, 240.000], loss: 28.760992, mean_absolute_error: 0.976429, acc: 0.070720, mean_q: 0.994323
8897/100000: episode: 8, duration: 53.421s, episode steps: 1799, steps per second: 34, episode reward: 1180.000, mean reward: 0.656 [0.000, 100.000], mean action: 6.430 [0.000, 13.000], mean observation: 1.981 [0.000, 240.000], loss: 27.216291, mean_absolute_error: 0.976294, acc: 0.074833, mean_q: 0.996801
10784/100000: episode: 9, duration: 56.207s, episode steps: 1887, steps per second: 34, episode reward: 1320.000, mean reward: 0.700 [0.000, 100.000], mean action: 6.379 [0.000, 13.000], mean observation: 1.796 [0.000, 240.000], loss: 31.204847, mean_absolute_error: 0.985795, acc: 0.070830, mean_q: 0.998568
11988/100000: episode: 10, duration: 35.985s, episode steps: 1204, steps per second: 33, episode reward: 1180.000, mean reward: 0.980 [0.000, 100.000], mean action: 6.602 [0.000, 13.000], mean observation: 1.731 [0.000, 240.000], loss: 30.669523, mean_absolute_error: 0.986347, acc: 0.074517, mean_q: 0.999281
13541/100000: episode: 11, duration: 46.164s, episode steps: 1553, steps per second: 34, episode reward: 1080.000, mean reward: 0.695 [0.000, 100.000], mean action: 6.574 [0.000, 13.000], mean observation: 1.633 [0.000, 240.000], loss: 28.269192, mean_absolute_error: 0.983127, acc: 0.073487, mean_q: 0.999560
14309/100000: episode: 12, duration: 23.052s, episode steps: 768, steps per second: 33, episode reward: 580.000, mean reward: 0.755 [0.000, 100.000], mean action: 6.793 [0.000, 13.000], mean observation: 2.165 [0.000, 240.000], loss: 28.651321, mean_absolute_error: 0.982694, acc: 0.075562, mean_q: 0.999704
14855/100000: episode: 13, duration: 16.280s, episode steps: 546, steps per second: 34, episode reward: 430.000, mean reward: 0.788 [0.000, 100.000], mean action: 6.255 [0.000, 13.000], mean observation: 2.600 [0.000, 240.000], loss: 25.917961, mean_absolute_error: 0.979710, acc: 0.075321, mean_q: 0.999771
15676/100000: episode: 14, duration: 24.421s, episode steps: 821, steps per second: 34, episode reward: 780.000, mean reward: 0.950 [0.000, 100.000], mean action: 6.279 [0.000, 13.000], mean observation: 2.161 [0.000, 240.000], loss: 29.796518, mean_absolute_error: 0.985070, acc: 0.073196, mean_q: 0.999817
17303/100000: episode: 15, duration: 48.080s, episode steps: 1627, steps per second: 34, episode reward: 1320.000, mean reward: 0.811 [0.000, 100.000], mean action: 6.373 [0.000, 13.000], mean observation: 1.493 [0.000, 240.000], loss: 31.838648, mean_absolute_error: 0.989215, acc: 0.072065, mean_q: 0.999859
18249/100000: episode: 16, duration: 27.618s, episode steps: 946, steps per second: 34, episode reward: 880.000, mean reward: 0.930 [0.000, 150.000], mean action: 6.580 [0.000, 13.000], mean observation: 1.984 [0.000, 240.000], loss: 33.025875, mean_absolute_error: 0.990334, acc: 0.072509, mean_q: 0.999892
21461/100000: episode: 17, duration: 93.989s, episode steps: 3212, steps per second: 34, episode reward: 1880.000, mean reward: 0.585 [0.000, 100.000], mean action: 6.588 [0.000, 13.000], mean observation: 1.762 [0.000, 240.000], loss: 30.154207, mean_absolute_error: 0.987160, acc: 0.070809, mean_q: 0.999936
22917/100000: episode: 18, duration: 42.662s, episode steps: 1456, steps per second: 34, episode reward: 1180.000, mean reward: 0.810 [0.000, 100.000], mean action: 6.620 [0.000, 13.000], mean observation: 1.905 [0.000, 240.000], loss: 28.899130, mean_absolute_error: 0.983624, acc: 0.068402, mean_q: 0.999970
23238/100000: episode: 19, duration: 9.459s, episode steps: 321, steps per second: 34, episode reward: 430.000, mean reward: 1.340 [0.000, 100.000], mean action: 6.533 [0.000, 13.000], mean observation: 2.253 [0.000, 240.000], loss: 26.920767, mean_absolute_error: 0.980413, acc: 0.075058, mean_q: 0.999978
23586/100000: episode: 20, duration: 10.229s, episode steps: 348, steps per second: 34, episode reward: 160.000, mean reward: 0.460 [0.000, 50.000], mean action: 6.871 [0.000, 13.000], mean observation: 2.307 [0.000, 240.000], loss: 25.303034, mean_absolute_error: 0.980627, acc: 0.065823, mean_q: 0.999980
26189/100000: episode: 21, duration: 78.370s, episode steps: 2603, steps per second: 33, episode reward: 1320.000, mean reward: 0.507 [0.000, 100.000], mean action: 6.503 [0.000, 13.000], mean observation: 1.784 [0.000, 240.000], loss: 27.165476, mean_absolute_error: 0.981853, acc: 0.070652, mean_q: 0.999986
27369/100000: episode: 22, duration: 34.425s, episode steps: 1180, steps per second: 34, episode reward: 980.000, mean reward: 0.831 [0.000, 100.000], mean action: 6.385 [0.000, 13.000], mean observation: 1.941 [0.000, 240.000], loss: 27.088072, mean_absolute_error: 0.980287, acc: 0.067505, mean_q: 0.999992
28548/100000: episode: 23, duration: 35.070s, episode steps: 1179, steps per second: 34, episode reward: 980.000, mean reward: 0.831 [0.000, 100.000], mean action: 6.528 [0.000, 13.000], mean observation: 2.060 [0.000, 240.000], loss: 24.597326, mean_absolute_error: 0.977471, acc: 0.068119, mean_q: 0.999994
28919/100000: episode: 24, duration: 11.016s, episode steps: 371, steps per second: 34, episode reward: 480.000, mean reward: 1.294 [0.000, 100.000], mean action: 6.625 [0.000, 13.000], mean observation: 2.276 [0.000, 240.000], loss: 30.021408, mean_absolute_error: 0.985281, acc: 0.075219, mean_q: 0.999995
30695/100000: episode: 25, duration: 52.892s, episode steps: 1776, steps per second: 34, episode reward: 1300.000, mean reward: 0.732 [0.000, 100.000], mean action: 6.465 [0.000, 13.000], mean observation: 1.728 [0.000, 240.000], loss: 26.489769, mean_absolute_error: 0.980855, acc: 0.071087, mean_q: 0.999996
32084/100000: episode: 26, duration: 41.220s, episode steps: 1389, steps per second: 34, episode reward: 930.000, mean reward: 0.670 [0.000, 100.000], mean action: 6.603 [0.000, 13.000], mean observation: 1.972 [0.000, 240.000], loss: 28.961477, mean_absolute_error: 0.984377, acc: 0.070082, mean_q: 0.999997
33770/100000: episode: 27, duration: 50.046s, episode steps: 1686, steps per second: 34, episode reward: 1350.000, mean reward: 0.801 [0.000, 100.000], mean action: 6.718 [0.000, 13.000], mean observation: 1.699 [0.000, 240.000], loss: 30.137896, mean_absolute_error: 0.985636, acc: 0.070192, mean_q: 0.999998
35502/100000: episode: 28, duration: 51.162s, episode steps: 1732, steps per second: 34, episode reward: 1760.000, mean reward: 1.016 [0.000, 100.000], mean action: 6.483 [0.000, 13.000], mean observation: 1.864 [0.000, 240.000], loss: 29.654802, mean_absolute_error: 0.986030, acc: 0.069302, mean_q: 0.999999
36281/100000: episode: 29, duration: 23.153s, episode steps: 779, steps per second: 34, episode reward: 610.000, mean reward: 0.783 [0.000, 100.000], mean action: 6.589 [0.000, 13.000], mean observation: 2.412 [0.000, 240.000], loss: 27.241671, mean_absolute_error: 0.980745, acc: 0.069480, mean_q: 0.999999
37151/100000: episode: 30, duration: 25.850s, episode steps: 870, steps per second: 34, episode reward: 830.000, mean reward: 0.954 [0.000, 100.000], mean action: 6.493 [0.000, 13.000], mean observation: 2.012 [0.000, 240.000], loss: 28.615261, mean_absolute_error: 0.984251, acc: 0.071480, mean_q: 0.999999
38717/100000: episode: 31, duration: 46.353s, episode steps: 1566, steps per second: 34, episode reward: 1180.000, mean reward: 0.754 [0.000, 100.000], mean action: 6.408 [0.000, 13.000], mean observation: 1.558 [0.000, 240.000], loss: 27.654753, mean_absolute_error: 0.983781, acc: 0.072777, mean_q: 0.999999
39922/100000: episode: 32, duration: 35.704s, episode steps: 1205, steps per second: 34, episode reward: 1080.000, mean reward: 0.896 [0.000, 100.000], mean action: 6.408 [0.000, 13.000], mean observation: 1.722 [0.000, 240.000], loss: 27.877979, mean_absolute_error: 0.982937, acc: 0.068361, mean_q: 0.999999
41911/100000: episode: 33, duration: 58.841s, episode steps: 1989, steps per second: 34, episode reward: 1560.000, mean reward: 0.784 [0.000, 100.000], mean action: 6.626 [0.000, 13.000], mean observation: 2.046 [0.000, 240.000], loss: 27.723555, mean_absolute_error: 0.982845, acc: 0.071220, mean_q: 0.999999
42709/100000: episode: 34, duration: 23.760s, episode steps: 798, steps per second: 34, episode reward: 880.000, mean reward: 1.103 [0.000, 100.000], mean action: 6.267 [0.000, 13.000], mean observation: 2.308 [0.000, 240.000], loss: 27.761700, mean_absolute_error: 0.982930, acc: 0.071429, mean_q: 0.999999
43787/100000: episode: 35, duration: 32.243s, episode steps: 1078, steps per second: 33, episode reward: 780.000, mean reward: 0.724 [0.000, 100.000], mean action: 6.662 [0.000, 13.000], mean observation: 2.266 [0.000, 240.000], loss: 27.965694, mean_absolute_error: 0.983301, acc: 0.072182, mean_q: 0.999999
46754/100000: episode: 36, duration: 88.179s, episode steps: 2967, steps per second: 34, episode reward: 2080.000, mean reward: 0.701 [0.000, 100.000], mean action: 6.551 [0.000, 13.000], mean observation: 2.167 [0.000, 240.000], loss: 30.416687, mean_absolute_error: 0.986695, acc: 0.070147, mean_q: 1.000000
48099/100000: episode: 37, duration: 40.026s, episode steps: 1345, steps per second: 34, episode reward: 1130.000, mean reward: 0.840 [0.000, 100.000], mean action: 6.546 [0.000, 13.000], mean observation: 1.729 [0.000, 240.000], loss: 32.340946, mean_absolute_error: 0.989692, acc: 0.068425, mean_q: 1.000000
48561/100000: episode: 38, duration: 13.956s, episode steps: 462, steps per second: 33, episode reward: 530.000, mean reward: 1.147 [0.000, 100.000], mean action: 6.578 [0.000, 13.000], mean observation: 2.210 [0.000, 240.000], loss: 25.409176, mean_absolute_error: 0.980768, acc: 0.071361, mean_q: 1.000000
done, took 1479.706 seconds
<keras.callbacks.History at 0x12150b908>
#Plot loss variations
import matplotlib.pyplot as plt
episodes = [1455,2423,3192,4037,4472,6292,7098,8897,
10784,11988,13541,14309,14855,15676,17303,18249,
21461,22917,23238,23586,26189,27369,28548,28919,
30695,32084,33770,35502,36281,37151,38717,39922,
41911,42709,43787,46754,48099,48561]
# Per-episode loss values transcribed from the training log above
loss = [21.74, 36.16, 32.86, 33.93, 31.15, 31.12, 28.76, 27.21, 31.20,
        30.66, 28.27, 28.65, 25.91, 29.79, 31.83, 33.02, 30.15, 28.89,
        26.92, 25.30, 27.16, 27.08, 24.59, 30.02, 26.48, 28.96, 30.13,
        29.65, 27.24, 28.61, 27.65, 27.88, 27.72, 27.76, 27.96, 30.41, 32.34, 25.41]
plt.plot(episodes, loss, 'r--')
plt.axis([0, 50000, 0, 40])
plt.show()
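# Optional variant (not part of the original run): the same plot with axis
# labels, which makes the figure self-explanatory.
plt.plot(episodes, loss, 'r--')
plt.axis([0, 50000, 0, 40])
plt.xlabel('training step at end of episode')
plt.ylabel('loss')
plt.show()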
## Save the model
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME_2), overwrite=True)
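# To reuse the trained agent later, rebuild and compile an identical agent,
# then restore the saved weights:
# dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME_2))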
# Evaluate the algorithm for 10 episodes
dqn.test(env, nb_episodes=10, visualize=True)
Testing for 10 episodes ...
Episode 1: reward: 110.000, steps: 726
Episode 2: reward: 130.000, steps: 604
Episode 3: reward: 210.000, steps: 613
Episode 4: reward: 110.000, steps: 922
Episode 5: reward: 110.000, steps: 622
Episode 6: reward: 260.000, steps: 571
Episode 7: reward: 130.000, steps: 612
Episode 8: reward: 260.000, steps: 567
Episode 9: reward: 260.000, steps: 576
Episode 10: reward: 260.000, steps: 578
<keras.callbacks.History at 0x14961b860>
### Another policy with DQN
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=.8, value_min=.01,
                              value_test=.0, nb_steps=100000)
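# LinearAnnealedPolicy ramps the wrapped policy's eps attribute linearly from
# value_max to value_min over nb_steps, and uses value_test during testing.
# A small sketch of the schedule it applies (illustrative, not keras-rl code):
def annealed_eps(step, value_max=0.8, value_min=0.01, nb_steps=100000):
    # linear ramp, clipped at value_min once nb_steps is reached
    slope = (value_min - value_max) / nb_steps
    return max(value_min, value_max + slope * step)
# Consistent with the log below: mean_eps is ~0.79 over episode 1
# (steps 0-2647), and annealed_eps(1323) ~= 0.79.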
dqn = DQNAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
               policy=policy, test_policy=policy,  # value_test=0 means greedy behaviour at test time
               memory=memory, target_model_update=1e-2)
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])
dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)
Training for 50000 steps ...
2647/50000: episode: 1, duration: 78.745s, episode steps: 2647, steps per second: 34, episode reward: 1180.000, mean reward: 0.446 [0.000, 100.000], mean action: 5.080 [0.000, 13.000], mean observation: 1.496 [0.000, 240.000], loss: 29.992819, mean_absolute_error: 0.987530, acc: 0.366749, mean_q: 1.000000, mean_eps: 0.789505
5062/50000: episode: 2, duration: 70.523s, episode steps: 2415, steps per second: 34, episode reward: 1390.000, mean reward: 0.576 [0.000, 100.000], mean action: 4.961 [0.000, 13.000], mean observation: 2.056 [0.000, 240.000], loss: 29.211633, mean_absolute_error: 0.985632, acc: 0.362526, mean_q: 1.000000, mean_eps: 0.769553
6988/50000: episode: 3, duration: 56.540s, episode steps: 1926, steps per second: 34, episode reward: 1410.000, mean reward: 0.732 [0.000, 100.000], mean action: 4.895 [0.000, 13.000], mean observation: 2.147 [0.000, 240.000], loss: 30.044086, mean_absolute_error: 0.987133, acc: 0.360965, mean_q: 1.000000, mean_eps: 0.752406
7721/50000: episode: 4, duration: 21.421s, episode steps: 733, steps per second: 34, episode reward: 430.000, mean reward: 0.587 [0.000, 100.000], mean action: 4.943 [0.000, 13.000], mean observation: 2.584 [0.000, 240.000], loss: 26.033418, mean_absolute_error: 0.980401, acc: 0.356114, mean_q: 1.000000, mean_eps: 0.741903
9006/50000: episode: 5, duration: 37.220s, episode steps: 1285, steps per second: 35, episode reward: 1280.000, mean reward: 0.996 [0.000, 100.000], mean action: 4.936 [0.000, 13.000], mean observation: 1.588 [0.000, 240.000], loss: 26.786589, mean_absolute_error: 0.981151, acc: 0.347811, mean_q: 1.000000, mean_eps: 0.733932
9489/50000: episode: 6, duration: 14.189s, episode steps: 483, steps per second: 34, episode reward: 460.000, mean reward: 0.952 [0.000, 100.000], mean action: 4.818 [0.000, 13.000], mean observation: 2.550 [0.000, 240.000], loss: 27.920443, mean_absolute_error: 0.983796, acc: 0.344591, mean_q: 1.000000, mean_eps: 0.726949
10482/50000: episode: 7, duration: 29.064s, episode steps: 993, steps per second: 34, episode reward: 930.000, mean reward: 0.937 [0.000, 100.000], mean action: 4.862 [0.000, 13.000], mean observation: 2.175 [0.000, 240.000], loss: 32.332520, mean_absolute_error: 0.988627, acc: 0.349100, mean_q: 1.000000, mean_eps: 0.721118
11303/50000: episode: 8, duration: 24.297s, episode steps: 821, steps per second: 34, episode reward: 580.000, mean reward: 0.706 [0.000, 100.000], mean action: 4.653 [0.000, 13.000], mean observation: 2.164 [0.000, 240.000], loss: 25.372195, mean_absolute_error: 0.978653, acc: 0.348432, mean_q: 1.000000, mean_eps: 0.713953
12967/50000: episode: 9, duration: 49.121s, episode steps: 1664, steps per second: 34, episode reward: 1180.000, mean reward: 0.709 [0.000, 100.000], mean action: 4.603 [0.000, 13.000], mean observation: 1.855 [0.000, 240.000], loss: 26.680792, mean_absolute_error: 0.980070, acc: 0.343675, mean_q: 1.000000, mean_eps: 0.704137
14767/50000: episode: 10, duration: 52.906s, episode steps: 1800, steps per second: 34, episode reward: 2050.000, mean reward: 1.139 [0.000, 100.000], mean action: 4.512 [0.000, 13.000], mean observation: 2.010 [0.000, 240.000], loss: 26.146817, mean_absolute_error: 0.980223, acc: 0.340104, mean_q: 1.000000, mean_eps: 0.690455
17088/50000: episode: 11, duration: 68.414s, episode steps: 2321, steps per second: 34, episode reward: 1880.000, mean reward: 0.810 [0.000, 100.000], mean action: 4.256 [0.000, 13.000], mean observation: 2.131 [0.000, 240.000], loss: 28.392949, mean_absolute_error: 0.984508, acc: 0.342996, mean_q: 1.000000, mean_eps: 0.674177
17370/50000: episode: 12, duration: 8.740s, episode steps: 282, steps per second: 32, episode reward: 160.000, mean reward: 0.567 [0.000, 50.000], mean action: 4.316 [0.000, 13.000], mean observation: 2.853 [0.000, 240.000], loss: 33.064301, mean_absolute_error: 0.992362, acc: 0.329455, mean_q: 1.000000, mean_eps: 0.663895
17887/50000: episode: 13, duration: 15.122s, episode steps: 517, steps per second: 34, episode reward: 580.000, mean reward: 1.122 [0.000, 100.000], mean action: 4.555 [0.000, 13.000], mean observation: 2.274 [0.000, 240.000], loss: 24.594645, mean_absolute_error: 0.978975, acc: 0.335529, mean_q: 1.000000, mean_eps: 0.660739
18599/50000: episode: 14, duration: 20.679s, episode steps: 712, steps per second: 34, episode reward: 630.000, mean reward: 0.885 [0.000, 100.000], mean action: 3.958 [0.000, 13.000], mean observation: 2.229 [0.000, 240.000], loss: 26.505238, mean_absolute_error: 0.981136, acc: 0.337166, mean_q: 1.000000, mean_eps: 0.655884
19641/50000: episode: 15, duration: 30.318s, episode steps: 1042, steps per second: 34, episode reward: 1080.000, mean reward: 1.036 [0.000, 100.000], mean action: 4.361 [0.000, 13.000], mean observation: 1.825 [0.000, 240.000], loss: 30.079884, mean_absolute_error: 0.988954, acc: 0.330824, mean_q: 1.000000, mean_eps: 0.648956
20361/50000: episode: 16, duration: 20.884s, episode steps: 720, steps per second: 34, episode reward: 830.000, mean reward: 1.153 [0.000, 100.000], mean action: 4.181 [0.000, 13.000], mean observation: 2.156 [0.000, 240.000], loss: 30.022883, mean_absolute_error: 0.986804, acc: 0.341710, mean_q: 1.000000, mean_eps: 0.641996
20921/50000: episode: 17, duration: 16.383s, episode steps: 560, steps per second: 34, episode reward: 280.000, mean reward: 0.500 [0.000, 50.000], mean action: 4.029 [0.000, 13.000], mean observation: 2.809 [0.000, 240.000], loss: 32.525301, mean_absolute_error: 0.990283, acc: 0.343750, mean_q: 1.000000, mean_eps: 0.636940
22419/50000: episode: 18, duration: 43.820s, episode steps: 1498, steps per second: 34, episode reward: 1080.000, mean reward: 0.721 [0.000, 100.000], mean action: 4.009 [0.000, 13.000], mean observation: 1.739 [0.000, 240.000], loss: 29.147020, mean_absolute_error: 0.986433, acc: 0.338722, mean_q: 1.000000, mean_eps: 0.628811
23198/50000: episode: 19, duration: 22.840s, episode steps: 779, steps per second: 34, episode reward: 930.000, mean reward: 1.194 [0.000, 100.000], mean action: 4.067 [0.000, 13.000], mean observation: 2.250 [0.000, 240.000], loss: 28.683109, mean_absolute_error: 0.986974, acc: 0.344071, mean_q: 1.000000, mean_eps: 0.619817
24514/50000: episode: 20, duration: 38.692s, episode steps: 1316, steps per second: 34, episode reward: 1360.000, mean reward: 1.033 [0.000, 100.000], mean action: 4.226 [0.000, 13.000], mean observation: 2.471 [0.000, 240.000], loss: 30.828469, mean_absolute_error: 0.991295, acc: 0.365715, mean_q: 1.000000, mean_eps: 0.611542
26366/50000: episode: 21, duration: 53.995s, episode steps: 1852, steps per second: 34, episode reward: 1320.000, mean reward: 0.713 [0.000, 100.000], mean action: 4.002 [0.000, 13.000], mean observation: 1.992 [0.000, 240.000], loss: 30.103927, mean_absolute_error: 0.989485, acc: 0.372182, mean_q: 1.000000, mean_eps: 0.599028
27983/50000: episode: 22, duration: 47.561s, episode steps: 1617, steps per second: 34, episode reward: 1490.000, mean reward: 0.921 [0.000, 100.000], mean action: 3.993 [0.000, 13.000], mean observation: 1.986 [0.000, 240.000], loss: 31.202262, mean_absolute_error: 0.990682, acc: 0.369086, mean_q: 1.000000, mean_eps: 0.585325
29873/50000: episode: 23, duration: 55.033s, episode steps: 1890, steps per second: 34, episode reward: 1480.000, mean reward: 0.783 [0.000, 100.000], mean action: 3.815 [0.000, 13.000], mean observation: 1.960 [0.000, 240.000], loss: 33.852708, mean_absolute_error: 0.996705, acc: 0.380539, mean_q: 1.000000, mean_eps: 0.571473
30851/50000: episode: 24, duration: 29.198s, episode steps: 978, steps per second: 33, episode reward: 780.000, mean reward: 0.798 [0.000, 100.000], mean action: 3.757 [0.000, 13.000], mean observation: 2.426 [0.000, 240.000], loss: 30.200397, mean_absolute_error: 0.990111, acc: 0.375991, mean_q: 1.000000, mean_eps: 0.560144
31931/50000: episode: 25, duration: 31.562s, episode steps: 1080, steps per second: 34, episode reward: 1180.000, mean reward: 1.093 [0.000, 100.000], mean action: 3.903 [0.000, 13.000], mean observation: 1.719 [0.000, 240.000], loss: 35.342483, mean_absolute_error: 0.997397, acc: 0.402402, mean_q: 1.000000, mean_eps: 0.552015
32370/50000: episode: 26, duration: 12.899s, episode steps: 439, steps per second: 34, episode reward: 230.000, mean reward: 0.524 [0.000, 50.000], mean action: 3.752 [0.000, 13.000], mean observation: 3.075 [0.000, 240.000], loss: 31.779249, mean_absolute_error: 0.992009, acc: 0.407247, mean_q: 1.000000, mean_eps: 0.546015
34069/50000: episode: 27, duration: 49.556s, episode steps: 1699, steps per second: 34, episode reward: 1370.000, mean reward: 0.806 [0.000, 100.000], mean action: 3.727 [0.000, 13.000], mean observation: 1.739 [0.000, 240.000], loss: 32.917870, mean_absolute_error: 0.994308, acc: 0.397752, mean_q: 1.000000, mean_eps: 0.537570
35248/50000: episode: 28, duration: 34.380s, episode steps: 1179, steps per second: 34, episode reward: 930.000, mean reward: 0.789 [0.000, 100.000], mean action: 3.759 [0.000, 13.000], mean observation: 2.046 [0.000, 240.000], loss: 31.251266, mean_absolute_error: 0.990669, acc: 0.407125, mean_q: 1.000000, mean_eps: 0.526202
36460/50000: episode: 29, duration: 35.646s, episode steps: 1212, steps per second: 34, episode reward: 1080.000, mean reward: 0.891 [0.000, 100.000], mean action: 3.340 [0.000, 13.000], mean observation: 1.672 [0.000, 240.000], loss: 32.289888, mean_absolute_error: 0.991152, acc: 0.403208, mean_q: 1.000000, mean_eps: 0.516757
38501/50000: episode: 30, duration: 59.686s, episode steps: 2041, steps per second: 34, episode reward: 1350.000, mean reward: 0.661 [0.000, 100.000], mean action: 3.484 [0.000, 13.000], mean observation: 1.783 [0.000, 240.000], loss: 30.293467, mean_absolute_error: 0.990134, acc: 0.409343, mean_q: 1.000000, mean_eps: 0.503908
39551/50000: episode: 31, duration: 30.607s, episode steps: 1050, steps per second: 34, episode reward: 980.000, mean reward: 0.933 [0.000, 100.000], mean action: 3.148 [0.000, 13.000], mean observation: 1.775 [0.000, 240.000], loss: 32.976856, mean_absolute_error: 0.992399, acc: 0.419137, mean_q: 1.000000, mean_eps: 0.491699
40200/50000: episode: 32, duration: 18.901s, episode steps: 649, steps per second: 34, episode reward: 380.000, mean reward: 0.586 [0.000, 100.000], mean action: 3.661 [0.000, 13.000], mean observation: 2.205 [0.000, 240.000], loss: 29.077991, mean_absolute_error: 0.986476, acc: 0.389686, mean_q: 1.000000, mean_eps: 0.484988
42374/50000: episode: 33, duration: 63.338s, episode steps: 2174, steps per second: 34, episode reward: 1300.000, mean reward: 0.598 [0.000, 100.000], mean action: 9.507 [0.000, 13.000], mean observation: 1.670 [0.000, 240.000], loss: 31.056681, mean_absolute_error: 0.986676, acc: 0.079836, mean_q: 0.999999, mean_eps: 0.473837
43610/50000: episode: 34, duration: 36.134s, episode steps: 1236, steps per second: 34, episode reward: 880.000, mean reward: 0.712 [0.000, 100.000], mean action: 9.278 [0.000, 13.000], mean observation: 2.080 [0.000, 240.000], loss: 28.141218, mean_absolute_error: 0.981980, acc: 0.117440, mean_q: 1.000000, mean_eps: 0.460367
45651/50000: episode: 35, duration: 59.989s, episode steps: 2041, steps per second: 34, episode reward: 1480.000, mean reward: 0.725 [0.000, 100.000], mean action: 9.611 [0.000, 13.000], mean observation: 1.897 [0.000, 240.000], loss: 27.262516, mean_absolute_error: 0.981316, acc: 0.158623, mean_q: 1.000000, mean_eps: 0.447423
48137/50000: episode: 36, duration: 72.982s, episode steps: 2486, steps per second: 34, episode reward: 2150.000, mean reward: 0.865 [0.000, 100.000], mean action: 9.624 [0.000, 13.000], mean observation: 2.096 [0.000, 240.000], loss: 26.563936, mean_absolute_error: 0.980988, acc: 0.219831, mean_q: 1.000000, mean_eps: 0.429541
49183/50000: episode: 37, duration: 30.598s, episode steps: 1046, steps per second: 34, episode reward: 980.000, mean reward: 0.937 [0.000, 100.000], mean action: 9.748 [0.000, 13.000], mean observation: 2.036 [0.000, 240.000], loss: 30.250379, mean_absolute_error: 0.987904, acc: 0.271481, mean_q: 1.000000, mean_eps: 0.415590
done, took 1484.830 seconds
<keras.callbacks.History at 0x15bfc3b38>
episodes_p2 = [2647,5062,6988,7721,9006,9489,
10482,11303,12967,14767,17088,17370,17887,18599,19641,
20361,20921,22419,23198,24514,26366,27983,29873,
30851,31931,32370,34069,35248,36460,38501,39551,
40200,42374,43610]
# Per-episode loss values transcribed from the training log above
loss_p2 = [29.99, 29.21, 30.04, 26.03, 26.79, 27.92, 32.33, 25.37, 26.68, 26.15, 28.39, 33.06,
           24.59, 26.51, 30.08, 30.02, 32.53, 29.15, 28.68, 30.83, 30.10, 31.20,
           33.85, 30.20, 35.34, 31.78, 32.92, 31.25, 32.29, 30.29, 32.98, 29.08, 31.06, 28.14]
plt.plot(episodes_p2, loss_p2, 'r--')
plt.axis([0, 50000, 0, 40])
plt.show()
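# Optional: overlay the two runs to compare the policies directly
# (uses the episodes/loss lists defined above).
plt.plot(episodes, loss, 'r--', label='BoltzmannQPolicy')
plt.plot(episodes_p2, loss_p2, 'b--', label='LinearAnnealedPolicy(EpsGreedyQPolicy)')
plt.axis([0, 50000, 0, 40])
plt.legend()
plt.show()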
dqn.test(env, nb_episodes=10, visualize=True)
# SARSA Agent -- on-policy Reinforcement Learning
from rl.agents.sarsa import SARSAAgent
sarsa = SARSAAgent(model, nb_actions,
                   policy=None, test_policy=None,  # None falls back to keras-rl's built-in defaults
                   gamma=0.99, nb_steps_warmup=10,
                   train_interval=1)
sarsa.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])
sarsa.fit(env, nb_steps=50000, visualize=True, verbose=2)
sarsa.test(env, nb_episodes=10, visualize=True)
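# As with the DQN agent, the trained SARSA weights can be saved for reuse,
# mirroring the earlier save call:
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME_2), overwrite=True)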