    import gym
    import random
    import numpy as np
    import matplotlib.pyplot as plt
    from collections import deque

    from keras.models import Sequential
    from keras.optimizers import Adam
    from keras.layers import Dense, Flatten
    from keras.layers.convolutional import Conv2D
    from keras import backend as K


    env = gym.make('BreakoutDeterministic-v4')
    observation = env.reset()

    for i in range(3):
        # The ball is released after 2 frames
        if i > 1:
            print(observation.shape)
            plt.imshow(observation)
            plt.show()

        # Get the next observation
        observation, _, _, _ = env.step(1)

(210, 160, 3)

[Figure: raw Breakout observation, 210x160 RGB frame]

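Breakout exposes four actions (NOOP, FIRE, RIGHT, LEFT). The agent defined below only chooses among three of them and shifts its choice by one when stepping the environment, so NOOP is never taken. A quick way to inspect the mapping, using gym's `get_action_meanings()` helper for Atari environments:

    print(env.unwrapped.get_action_meanings())
    # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
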
    def preprocess_frame(frame):
        frame = frame[30:200, 10:150]

        # take a single color channel and downsample by a factor of 2
        frame = frame[::2, ::2, 0]

        # set background to 0
        frame[frame == 144] = 0
        frame[frame == 109] = 0

        # set ball and paddles to 1
        frame[frame != 0] = 1

        return frame


    obs_preprocessed = preprocess_frame(observation)
    print(obs_preprocessed.shape)
    plt.imshow(obs_preprocessed, cmap='gray')
    plt.show()

(85, 70)

[Figure: preprocessed frame, cropped, downsampled and binarized to 85x70]

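The crop keeps rows 30–199 and columns 10–149 of the 210×160 screen, and taking every second pixel halves both dimensions, so the result is (200 − 30) / 2 × (150 − 10) / 2 = 85 × 70, matching the shape printed above. A quick sanity check:

    # A 170x140 crop followed by a stride-2 downsample gives 85x70
    assert obs_preprocessed.shape == ((200 - 30) // 2, (150 - 10) // 2)
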
    class DQLAgent:
        def __init__(self, cols, rows, n_actions, batch_size=32):
            self.state_size = (cols, rows, 4)
            self.n_actions = n_actions
            self.epsilon = 1.
            self.epsilon_start, self.epsilon_end = 1.0, 0.1
            self.exploration_steps = 1000000.
            self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) / self.exploration_steps
            self.batch_size = batch_size
            self.discount_factor = 0.99
            self.memory = deque(maxlen=400000)
            self.model = self.build_model()
            self.target_model = self.build_model()
            self.optimizer = self.build_optimizer()
            self.avg_q_max, self.avg_loss = 0, 0

        def build_optimizer(self):
            a = K.placeholder(shape=(None,), dtype='int32')
            y = K.placeholder(shape=(None,), dtype='float32')

            py_x = self.model.output

            a_one_hot = K.one_hot(a, self.n_actions)
            q_value = K.sum(py_x * a_one_hot, axis=1)
            error = K.abs(y - q_value)

            quadratic_part = K.clip(error, 0.0, 1.0)
            linear_part = error - quadratic_part
            loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

            opt = Adam(lr=0.00025, epsilon=0.01)
            updates = opt.get_updates(self.model.trainable_weights, [], loss)
            train = K.function([self.model.input, a, y], [loss], updates=updates)

            return train

        def build_model(self):
            model = Sequential()
            model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=self.state_size))
            model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
            model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
            model.add(Flatten())
            model.add(Dense(512, activation='relu'))
            model.add(Dense(self.n_actions))
            model.summary()
            return model

        def update_model(self):
            self.target_model.set_weights(self.model.get_weights())

        def action(self, history):
            history = np.float32(history / 255.0)
            if np.random.rand() <= self.epsilon:
                return random.randrange(self.n_actions)
            else:
                q_value = self.model.predict(history)
                return np.argmax(q_value[0])

        def replay(self, history, action, reward, next_history, dead):
            self.memory.append((history, action, reward, next_history, dead))

        def train(self):
            if len(self.memory) < self.batch_size:
                return
            if self.epsilon > self.epsilon_end:
                self.epsilon -= self.epsilon_decay_step

            mini_batch = random.sample(self.memory, self.batch_size)
            history = np.zeros((self.batch_size, self.state_size[0], self.state_size[1], self.state_size[2]))
            next_history = np.zeros((self.batch_size, self.state_size[0], self.state_size[1], self.state_size[2]))
            target = np.zeros((self.batch_size,))
            action, reward, dead = [], [], []

            for i in range(self.batch_size):
                history[i] = np.float32(mini_batch[i][0] / 255.)
                next_history[i] = np.float32(mini_batch[i][3] / 255.)
                action.append(mini_batch[i][1])
                reward.append(mini_batch[i][2])
                dead.append(mini_batch[i][4])

            target_value = self.target_model.predict(next_history)

            for i in range(self.batch_size):
                if dead[i]:
                    target[i] = reward[i]
                else:
                    target[i] = reward[i] + self.discount_factor * \
                                            np.amax(target_value[i])

            loss = self.optimizer([history, action, target])
            self.avg_loss += loss[0]

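The training function built in `build_optimizer()` updates only the Q-value of the action that was actually taken (via the one-hot mask) and uses a Huber-style loss: quadratic for temporal-difference errors below 1 and linear beyond that, which keeps gradients bounded when targets are noisy. A minimal NumPy sketch of the same piecewise loss (the function name is only illustrative):

    def huber_like_loss(y_true, q_value, delta=1.0):
        # Quadratic near zero, linear beyond |error| = delta,
        # mirroring the clipped error term in the agent's training function
        error = np.abs(y_true - q_value)
        quadratic = np.clip(error, 0.0, delta)
        linear = error - quadratic
        return np.mean(0.5 * quadratic ** 2 + linear)
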

    env = gym.make('BreakoutDeterministic-v4')

    # General settings
    n_warmup_steps = 50000
    update_model_rate = 10000
    cols, rows = 85, 70
    n_states = 4

    # Hyperparameters
    batch_size = 32

    # Initialization
    agent = DQLAgent(cols, rows, n_actions=3)
    scores, episodes = [], []
    n_steps = 0

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_52 (Conv2D)           (None, 20, 16, 32)        8224      
_________________________________________________________________
conv2d_53 (Conv2D)           (None, 9, 7, 64)          32832     
_________________________________________________________________
conv2d_54 (Conv2D)           (None, 7, 5, 64)          36928     
_________________________________________________________________
flatten_18 (Flatten)         (None, 2240)              0         
_________________________________________________________________
dense_34 (Dense)             (None, 512)               1147392   
_________________________________________________________________
dense_35 (Dense)             (None, 3)                 1539      
=================================================================
Total params: 1,226,915
Trainable params: 1,226,915
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_55 (Conv2D)           (None, 20, 16, 32)        8224      
_________________________________________________________________
conv2d_56 (Conv2D)           (None, 9, 7, 64)          32832     
_________________________________________________________________
conv2d_57 (Conv2D)           (None, 7, 5, 64)          36928     
_________________________________________________________________
flatten_19 (Flatten)         (None, 2240)              0         
_________________________________________________________________
dense_36 (Dense)             (None, 512)               1147392   
_________________________________________________________________
dense_37 (Dense)             (None, 3)                 1539      
=================================================================
Total params: 1,226,915
Trainable params: 1,226,915
Non-trainable params: 0
_________________________________________________________________

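The summary is printed twice because both the online model and the target model are created with `build_model()`, which calls `model.summary()`. The output shapes follow the 'valid' convolution formula floor((input − kernel) / stride) + 1: the 85×70×4 input becomes 20×16 after the 8×8/stride-4 layer, 9×7 after the 4×4/stride-2 layer, and 7×5 after the 3×3/stride-1 layer, so the flattened vector has 7 · 5 · 64 = 2240 entries. A small check of that arithmetic (the helper and variable names are just for illustration):

    def conv_output_size(size, kernel, stride):
        # Output length of a 'valid' convolution along one dimension
        return (size - kernel) // stride + 1

    h, w = 85, 70
    for kernel, stride in [(8, 4), (4, 2), (3, 1)]:
        h, w = conv_output_size(h, kernel, stride), conv_output_size(w, kernel, stride)
        print(h, w)
    # 20 16
    # 9 7
    # 7 5
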


    while True:
        done = False
        dead = False
        step, score, start_life = 0, 0, 5
        observation = env.reset()


        state = preprocess_frame(observation)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, cols, rows, n_states))

        while not done:
    #         env.render()
            n_steps += 1
            step += 1

            # Get action
            action = agent.action(history)
            observation, reward, done, info = env.step(action+1)

            # Extract next state
            state_next = preprocess_frame(observation)
            state_next = np.reshape([state_next], (1, cols, rows, 1))
            history_next = np.append(state_next, history[:, :, :, :3], axis=3)

            agent.avg_q_max += np.amax(agent.model.predict(history)[0])
            # Mark the transition as terminal when a life is lost so that
            # value is not propagated across lives
            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            reward = np.clip(reward, -1., 1.)

            agent.replay(history, action, reward, history_next, dead)
            agent.train()
            if n_steps % update_model_rate == 0:
                agent.update_model()
            score += reward

            if dead:
                dead = False
            else:
                history = history_next

            if done:
                print('episode {:2d}; score: {:2.0f}; q {:2f}; loss {:2f}; steps {}'
                      .format(n_steps, score, agent.avg_q_max / float(step), agent.avg_loss / float(step), step))

                # Keep the per-episode score for later inspection
                scores.append(score)
                agent.avg_q_max, agent.avg_loss = 0, 0

        # Save the model weights at the end of each episode
        agent.model.save_weights("breakout_dql.h5")

episode 38184; score:  1; q 0.103984; loss 0.003715; steps 184
episode 38344; score:  1; q 0.104243; loss 0.002910; steps 160
...
episode 106467; score:  0; q 0.374253; loss 0.003024; steps 134

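With the per-episode scores collected in the `scores` list inside the `if done:` block, a rough learning curve can be plotted once training is stopped; a minimal sketch:

    plt.plot(scores)
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.show()
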


    env = gym.make('BreakoutDeterministic-v4')
    agent = DQLAgent(cols, rows, n_actions=3)

    # Load the trained weights and act greedily during evaluation
    agent.model.load_weights("breakout_dql.h5")
    agent.epsilon = 0.0

    for i in range(5):
        observation = env.reset()
        done = False

        state = preprocess_frame(observation)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, cols, rows, n_states))

        while not done:
            env.render()
            action = agent.action(history)
            observation, reward, done, info = env.step(action + 1)

            # Update the stacked frame history with the new observation
            state_next = preprocess_frame(observation)
            state_next = np.reshape([state_next], (1, cols, rows, 1))
            history = np.append(state_next, history[:, :, :, :3], axis=3)