Reinforcement Learning with keras
Overview
I tried reinforcement learning with keras.
I used the CartPole environment from openai-gym.
keras-rl is not used.
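For comparison only, a DQN agent for the same task built with keras-rl would look roughly like the untested sketch below (hypothetical hyperparameters; assumes keras-rl and standalone Keras are installed). The rest of this post builds the agent by hand instead.

# Rough keras-rl sketch, NOT used in this post; shown only for comparison.
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

env = gym.make('CartPole-v0')
nb_actions = env.action_space.n

# Small Q-network; Flatten absorbs keras-rl's (window_length, obs_dim) input.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16, activation='tanh'))
model.add(Dense(nb_actions, activation='linear'))

# DQNAgent wires up the replay memory, epsilon-greedy policy, and training loop.
dqn = DQNAgent(model=model, nb_actions=nb_actions,
               memory=SequentialMemory(limit=5000, window_length=1),
               policy=EpsGreedyQPolicy(), nb_steps_warmup=32)
dqn.compile(Adam(lr=0.001), metrics=['mae'])
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)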
Photo
(Result graph of the training run: the reward trend saved as gym31.png by the script below.)
Sample code
import gym
import numpy as np
import time
from collections import deque
from gym import wrappers
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense
from tensorflow.contrib.keras.python.keras.optimizers import Adam, SGD
from tensorflow.contrib.keras.python.keras import backend as K
import tensorflow as tf
import matplotlib.pyplot as plt

class Network:
    # Q-network: a 4-dimensional state goes in, one Q-value per action comes out.
    def __init__(self, learning_rate=0.01, state_size=4, action_size=2, hidden_size=10):
        self.model = Sequential()
        self.model.add(Dense(hidden_size, activation='tanh', input_dim=state_size))
        self.model.add(Dense(hidden_size, activation='tanh'))
        self.model.add(Dense(action_size, activation='linear'))
        self.optimizer = Adam(lr=learning_rate)
        self.model.compile(loss=self.loss, optimizer=self.optimizer)

    def replay(self, memory, batch_size, gamma):
        # Experience replay: train on a random mini-batch of stored transitions.
        inputs = np.zeros((batch_size, 4))
        targets = np.zeros((batch_size, 2))
        mini_batch = memory.sample(batch_size)
        for i, (state_b, action_b, reward_b, next_state_b) in enumerate(mini_batch):
            inputs[i:i + 1] = state_b
            target = reward_b
            # An all-zero next state marks the end of an episode; otherwise bootstrap.
            if not (next_state_b == np.zeros(state_b.shape)).all(axis=1):
                retmainQs = self.model.predict(next_state_b)[0]
                next_action = np.argmax(retmainQs)
                target = reward_b + gamma * self.model.predict(next_state_b)[0][next_action]
            targets[i] = self.model.predict(state_b)
            targets[i][action_b] = target  # only the taken action's Q-value gets the TD target
        self.model.fit(inputs, targets, epochs=1, verbose=0)

    def loss(self, y_true, y_pred):
        # Huber loss: quadratic for small errors, linear for large ones.
        err = y_true - y_pred
        cond = K.abs(err) < 1.0
        L2 = 0.5 * K.square(err)
        L1 = K.abs(err) - 0.5
        loss = tf.where(cond, L2, L1)
        return K.mean(loss)

class Memory:
    # Fixed-size replay buffer of (state, action, reward, next_state) tuples.
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
        return [self.buffer[ii] for ii in idx]

    def len(self):
        return len(self.buffer)

env = gym.make('CartPole-v0')
gamma = 0.99
memory_size = 5000
mainN = Network(hidden_size=16, learning_rate=0.00001)
memory = Memory(max_size=memory_size)
reward_trend = []
for episode in range(299):
    env.reset()
    # The observation from reset() is discarded; a random first step provides the initial state.
    state, reward, done, _ = env.step(env.action_space.sample())
    state = np.reshape(state, [1, 4])
    for t in range(200):
        # env.render()
        action = 0
        # Epsilon-greedy action selection; exploration decays as episodes progress.
        epsilon = 0.001 + 0.9 / (1.0 + episode)
        if epsilon <= np.random.uniform(0, 1):
            retTargetQs = mainN.model.predict(state)[0]
            action = np.argmax(retTargetQs)
        else:
            action = np.random.choice([0, 1])
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        if done:
            next_state = np.zeros(state.shape)
            # Reward shaping: penalize early failure, reward surviving to the step limit.
            if t < 195:
                reward = -1
            else:
                reward = 1
        else:
            reward = 0
        memory.add((state, action, reward, next_state))
        state = next_state
        if memory.len() > 32:
            mainN.replay(memory, 32, gamma)
        if done:
            reward_trend.append(t + 1)
            print('%d Episode %d memory %d' % (episode, t + 1, memory.len()))
            break
plt.plot(reward_trend)
plt.savefig("gym31.png")
plt.show()
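A note on portability: the tensorflow.contrib.keras.python.keras import path only exists in early TensorFlow 1.x releases and was removed later, and the listing assumes the old four-value gym step API (gym 0.26 and later return five values from env.step()). As a rough, untested adaptation for TensorFlow 2.x, the same classes should be importable from tf.keras, for example:

# Hedged alternative imports assuming TensorFlow 2.x (the listing above targets TF 1.x):
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
# With tf.keras, Adam(learning_rate=...) is the preferred spelling of Adam(lr=...).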
That's all.
Reference
https://qiita.com/ohisama@github/items/e521e4581857e257efc5