Reinforcement Learning in Keras

Overview



I tried reinforcement learning with Keras.
The task is OpenAI Gym's CartPole.
keras-rl is not used; the DQN-style agent below is written directly with Keras.
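
For context, here is a minimal sketch of the raw Gym interaction loop that the script below builds on (random actions only, no learning; the same CartPole-v0 API calls the sample code uses):

import gym

env = gym.make('CartPole-v0')
state = env.reset()                               # 4-dimensional observation
done = False
while not done:
    action = env.action_space.sample()            # random action: 0 or 1
    state, reward, done, info = env.step(action)  # reward is +1 per surviving step
env.close()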

Picture

(Figure: the number of steps each episode lasted over training, plotted by the script below and saved as gym31.png.)

Sample code


import gym
import numpy as np
import time
from collections import deque
from gym import wrappers
# Keras bundled with TensorFlow 1.x (tf.contrib); newer versions expose the same API as tensorflow.keras.
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense
from tensorflow.contrib.keras.python.keras.optimizers import Adam, SGD
from tensorflow.contrib.keras.python.keras import backend as K
import tensorflow as tf
import matplotlib.pyplot as plt

class Network:
    # Q-network: two tanh hidden layers and a linear output with one Q-value per action.
    def __init__(self, learning_rate = 0.01, state_size = 4, action_size = 2, hidden_size = 10):
        self.model = Sequential()
        self.model.add(Dense(hidden_size, activation = 'tanh', input_dim = state_size))
        self.model.add(Dense(hidden_size, activation = 'tanh'))
        self.model.add(Dense(action_size, activation = 'linear'))
        self.optimizer = Adam(lr = learning_rate)
        self.model.compile(loss = self.loss, optimizer = self.optimizer)
    def replay(self, memory, batch_size, gamma):
        # Train on a random mini-batch of (state, action, reward, next_state) transitions.
        inputs = np.zeros((batch_size, 4))
        targets = np.zeros((batch_size, 2))
        mini_batch = memory.sample(batch_size)
        for i, (state_b, action_b, reward_b, next_state_b) in enumerate(mini_batch):
            inputs[i : i + 1] = state_b
            target = reward_b
            # Terminal transitions are stored with an all-zero next_state,
            # so only non-terminal ones get a bootstrapped Q-learning target.
            if not (next_state_b == np.zeros(state_b.shape)).all(axis = 1):
                retmainQs = self.model.predict(next_state_b)[0]
                next_action = np.argmax(retmainQs)
                target = reward_b + gamma * retmainQs[next_action]
            targets[i] = self.model.predict(state_b)
            targets[i][action_b] = target
        # Fit once on the full mini-batch.
        self.model.fit(inputs, targets, epochs = 1, verbose = 0)
    def loss(self, y_true, y_pred):
        # Huber loss: quadratic for small errors, linear for large ones,
        # which keeps large TD errors from producing huge gradient updates.
        err = y_true - y_pred
        cond = K.abs(err) < 1.0
        L2 = 0.5 * K.square(err)
        L1 = K.abs(err) - 0.5
        loss = tf.where(cond, L2, L1)
        return K.mean(loss)

class Memory:
    # Simple experience-replay buffer backed by a fixed-size deque.
    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen = max_size)
    def add(self, experience):
        self.buffer.append(experience)
    def sample(self, batch_size):
        # Draw batch_size distinct transitions uniformly at random.
        idx = np.random.choice(np.arange(len(self.buffer)), size = batch_size, replace = False)
        return [self.buffer[ii] for ii in idx]
    def len(self):
        return len(self.buffer)

env = gym.make('CartPole-v0')
gamma = 0.99            # discount factor
memory_size = 5000      # replay buffer capacity
mainN = Network(hidden_size = 16, learning_rate = 0.00001)
memory = Memory(max_size = memory_size)
reward_trend = []       # number of steps each episode lasted
for episode in range(299):
    env.reset()
    # Take one random action to obtain the initial state.
    state, reward, done, _ = env.step(env.action_space.sample())
    state = np.reshape(state, [1, 4])
    for t in range(200):
        #env.render()
        # Epsilon-greedy action selection; epsilon decays as the episode count grows.
        action = 0
        epsilon = 0.001 + 0.9 / (1.0 + episode)
        if epsilon <= np.random.uniform(0, 1):
            retTargetQs = mainN.model.predict(state)[0]
            action = np.argmax(retTargetQs)
        else:
            action = np.random.choice([0, 1])
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        if done:
            # Mark the terminal state with zeros and shape the reward:
            # -1 if the pole fell before 195 steps, +1 if the episode nearly ran full length.
            next_state = np.zeros(state.shape)
            if t < 195:
                reward = -1
            else:
                reward = 1
        else:
            reward = 0
        memory.add((state, action, reward, next_state))
        state = next_state
        # Start training once there are more transitions than one mini-batch.
        if (memory.len() > 32):
            mainN.replay(memory, 32, gamma)
        if done:
            reward_trend.append(t + 1)
            print ('Episode %d finished after %d steps (memory %d)' % (episode, t + 1, memory.len()))
            break
# Plot how many steps each episode lasted and save the figure.
plt.plot(reward_trend)
plt.savefig("gym31.png")
plt.show()
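
If you want to actually watch what the trained network learned, one possible addition (a sketch, not part of the original script; it reuses mainN and env from above) is a single greedy episode with rendering:

state = np.reshape(env.reset(), [1, 4])
for t in range(200):
    env.render()
    action = np.argmax(mainN.model.predict(state)[0])   # epsilon = 0: always take the best Q-value
    next_state, reward, done, info = env.step(action)
    state = np.reshape(next_state, [1, 4])
    if done:
        print ('Greedy episode lasted %d steps' % (t + 1))
        break
env.close()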


That's all.
