Keras 2 on a Raspberry Pi
Tags: Raspberry Pi 3, reinforcement learning, Keras 2.0
Overview
I tried Keras on a Raspberry Pi.
I tried gym's CartPole; a quick sanity check follows right after this overview.
I tried reinforcement learning with Keras.
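Before adding Keras, it helps to confirm that gym's CartPole environment runs on the Pi at all. The sketch below just takes random actions and assumes the same older gym API as the sample code further down (reset() returns only the observation, step() returns four values).

import gym

# Minimal CartPole check with random actions, no learning yet.
env = gym.make('CartPole-v0')
observation = env.reset()
total_reward = 0
for t in range(200):
    env.render()
    action = env.action_space.sample()   # 0 = push left, 1 = push right
    observation, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break
print("episode reward:", total_reward)

With a random policy the episode usually ends well before 200 steps; the point is only to verify that the environment and the render window work.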
Photo
Sample code
import gym
import numpy as np
import random
from tensorflow.contrib.keras.python.keras.models import Model
from tensorflow.contrib.keras.python.keras.layers import *
from tensorflow.contrib.keras.python.keras import backend as K
from collections import deque
import matplotlib.pyplot as plt

def randf(s, e):
    # Uniform random float in [s, e).
    return (float(random.randrange(0, (e - s) * 9999)) / 10000) + s

def discount_rewards(r, gamma = 0.99):
    # Turn per-step rewards into discounted returns (future rewards weighted by gamma).
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

def one_hot(index, categories):
    x = np.zeros((categories, ))
    x[index] = 1
    return x

def loss(advan):
    # Policy-gradient loss: negative log-probability of the action actually taken,
    # weighted by the advantage (discounted return) supplied as an extra input.
    def f(y_true, y_pred):
        respons = K.sum(y_true * y_pred, axis = 1)
        policy_loss = -K.sum(advan * K.log(respons))
        return policy_loss
    return f

env = gym.make('CartPole-v0')

# Policy network: 4 observation values in, softmax over the 2 actions out.
state = Input(shape = (4, ))
x = Dense(24)(state)
x = Dense(24)(x)
x = Dense(2)(x)
x = Activation('softmax')(x)
model_a = Model(state, x)

# Training wrapper: the same policy network plus an advantage input used only by the loss.
state = Input(shape = (4, ))
adv_input = Input(shape = (1, ))
x = model_a(state)
model_b = Model([state, adv_input], x)
model_b.compile(optimizer = 'adam', loss = loss(adv_input))

all_rewards = deque(maxlen = 100)
epsilon = 1
epsilonMinimumValue = 0.001
reward_trend = []
for i_episode in range(200):
    observation = env.reset()
    state_history, action_history, reward_history = [], [], []
    for t in range(200):
        env.render()
        state_history.append(observation)
        pred = model_a.predict(np.expand_dims(observation, axis = 0))[0]
        if (randf(0, 1) < epsilon):
            # Explore: sample an action from the policy's probabilities.
            action = np.random.choice(len(pred), 1, p = pred)[0]
        else:
            # Exploit: take the most probable action.
            action = np.argmax(pred)
        if (epsilon > epsilonMinimumValue):
            epsilon = epsilon * 0.999
        observation, reward, done, info = env.step(action)
        reward_history.append(reward)
        action_history.append(one_hot(action, 2))
        if done:
            reward_sum = sum(reward_history)
            reward_trend.append(reward_sum)
            # Note: the running mean is nan on the very first episode (all_rewards is still empty).
            print(i_episode, "Episode finished with reward {} {:.2f}".format(reward_sum, np.mean(all_rewards)))
            if reward_sum > 198:
                reward_sum += 100
            all_rewards.append(reward_sum)
            # One policy-gradient update on the whole episode.
            adv = discount_rewards(reward_history)
            state_history = np.array(state_history)
            action_history = np.array(action_history)
            model_b.train_on_batch([state_history, adv], action_history)
            break

plt.plot(reward_trend)
plt.savefig("gym26.png")
plt.show()
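A few notes on how the sample works. model_a and model_b share the same weights: model_a is used only to predict action probabilities, while model_b takes the discounted return as a second input so the custom loss can weight the log-probability of each chosen action by that return, a basic REINFORCE-style policy gradient. discount_rewards is what turns the raw per-step rewards into those returns. A tiny check of its output, using the discount_rewards function defined in the sample above (values worked out by hand for gamma = 0.99; the exact array formatting may differ):

print(discount_rewards([1.0, 1.0, 1.0]))
# [2.9701  1.99  1.0]  -- earlier steps accumulate more of the future reward

Each return then acts as the advantage for the action taken at that step, so actions in long (good) episodes are reinforced more strongly than actions in short ones.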
Results
(Per-episode reward trend plotted by the script, saved as gym26.png.)
That's all.
Reference
On this topic (Keras 2 on a Raspberry Pi), the original article is available at https://qiita.com/ohisama@github/items/57129c29a90b061e7ab4
The text may be shared or copied freely, but please keep that URL as the reference URL. (Collection and Share based on the CC Protocol.)