
Deep Q-learning - training slows down significantly

Stack Overflow user
Asked on 2019-11-29 02:10:46
1 answer · 1.2K views · 1 vote

I'm trying to build a deep Q-network to play Snake. I designed the game so that the window is 600 by 600 and the snake's head moves 30 pixels per tick. I implemented the DQN algorithm with memory replay and a target network, but once the policy network starts updating its weights, training slows down dramatically: each iteration of the weight-update loop takes about 5 minutes. Moreover, I see almost no improvement in the agent's performance, even after training for about 500 episodes. Here is the agent's code:

Code language: python
import numpy as np
import tensorflow as tf
from snake_rl.envs.snake_env import SnakeEnv
import random
from Game.experience import Experience
import time
import pygame
from PIL import Image
from keras import Sequential
from keras.layers import Conv2D, Dense, BatchNormalization, Activation, Flatten, Reshape
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


class Brain:
    def __init__(self, learning_rate, discount_rate, eps_start, eps_end, eps_decay, memory_size, batch_size, max_episodes, max_steps, target_update):
        self.memory = []
        self.push_count = 0
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.eps_start = eps_start
        self.current_eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.max_steps = max_steps
        self.max_episodes = max_episodes
        self.current_episode = 1
        self.policy_model = None
        self.replay_model = None
        self.target_update = target_update
        pygame.init()
        self.screen = pygame.display.set_mode((600, 600))
        pygame.display.set_caption("Snake")       

    def build_model(self):
        self.policy_model = Sequential()
        self.policy_model.add(Conv2D(8, (5, 5), padding = 'same', activation = 'relu', data_format = "channels_last", input_shape = (600, 600, 2)))
        self.policy_model.add(Conv2D(16, (5, 5), padding="same", activation="relu"))
        self.policy_model.add(Conv2D(32, (5, 5), padding="same", activation="relu"))
        self.policy_model.add(Flatten())
        self.policy_model.add(Dense(16, activation = "relu"))
        self.policy_model.add(Dense(5, activation = "softmax"))
        self.policy_model.compile(optimizer = 'rmsprop', loss = 'mean_squared_error')

        self.replay_model = Sequential()
        self.replay_model.add(Conv2D(8, (5, 5), padding = 'same', activation = 'relu', data_format = "channels_last", input_shape = (600, 600, 2)))
        self.replay_model.add(Conv2D(16, (5, 5), padding="same", activation="relu"))
        self.replay_model.add(Conv2D(32, (5, 5), padding="same", activation="relu"))
        self.replay_model.add(Flatten())
        self.replay_model.add(Dense(16, activation = "relu"))
        self.replay_model.add(Dense(5, activation = "softmax"))
        self.replay_model.compile(optimizer = 'rmsprop', loss = 'mean_squared_error')
        print(self.policy_model.summary())

    def decay_epsilon(self, episode):
        self.current_eps = self.eps_end + (self.eps_start - self.eps_end) * np.exp(-self.eps_decay * episode)

    def push_memory(self, new_memory):
        if(len(self.memory) < self.memory_size):
            self.memory.append(new_memory)
        else:
            self.memory[self.push_count % self.memory_size] = new_memory
        self.push_count += 1

    def sample_memory(self):
        return random.sample(self.memory, self.batch_size)

    def can_sample_memory(self):
        return len(self.memory) >= self.batch_size

    def screenshot(self):
        data = pygame.image.tostring(self.screen, 'RGB')
        image = Image.frombytes('RGB', (600, 600), data)
        image = image.convert('LA')
        matrix = np.asarray(image.getdata(), dtype=np.uint8)
        matrix = (matrix - 128)/(128 - 1)
        matrix = np.reshape(matrix, (1, 600, 600, 2))
        return matrix

    def train(self):
        tf.logging.set_verbosity(tf.logging.ERROR)
        self.build_model()
        for episode in range(self.max_episodes):
            self.current_episode = episode
            env = SnakeEnv(self.screen)
            episode_reward = 0
            for timestep in range(self.max_steps):
                env.render(self.screen)
                state = self.screenshot()
                #state = env.get_state()
                action = None
                epsilon = self.current_eps
                if epsilon > random.random():
                    action = np.random.choice(env.action_space) #explore
                else:
                    values = self.policy_model.predict(state) #exploit
                    action = np.argmax(values)
                experience = env.step(action)
                if(experience['done'] == True):
                    episode_reward += experience['reward']
                    break
                episode_reward += experience['reward']
                self.push_memory(Experience(experience['state'], experience['action'], experience['reward'], experience['next_state']))
                self.decay_epsilon(episode)
                if self.can_sample_memory():
                    memory_sample = self.sample_memory()
                    X = []
                    Y = []
                    for memory in memory_sample:
                        memstate = memory.state
                        action = memory.action
                        next_state = memory.next_state
                        reward = memory.reward
                        max_q = reward + (self.discount_rate * self.replay_model.predict(next_state)) #bellman equation
                        X.append(memstate)
                        Y.append(max_q)
                    X = np.array(X)
                    X = X.reshape([-1, 600, 600, 2])
                    Y = np.array(Y)
                    Y = Y.reshape([128, 5])
                    self.policy_model.fit(X, Y)
            print("Episode: ", episode, " Total Reward: ", episode_reward)
            if episode % self.target_update == 0:
                self.replay_model.set_weights(self.policy_model.get_weights())
        self.policy_model.save_weights('weights.hdf5')
        pygame.quit()

    def render(self):
        self.env.render(self.screen)

    def choose_action(self, state):
        q_values = self.policy_model.predict(state)
        action = np.amax(q_values)
        return action

    def load(self):
        self.build_model()
        self.policy_model.load_weights("weights.hdf5")

    def play(self):
        for episode in range(100):
            env = SnakeEnv(self.screen)
            for timestep in range(1000):
                env.render(self.screen)
                pred = self.policy_model.predict(env.get_state())
                print(np.array(pred))
                action = np.amax(pred)
                d = env.step(action)
                if(d['done'] == True):
                    break

My hyperparameters are as follows:

Code language: python
learning_rate = 0.5
discount_rate = 0.99
eps_start = 1
eps_end = .01
eps_decay = .001
memory_size = 100000
batch_size = 128
max_episodes = 1000
max_steps = 5000
target_update = 10

Does anyone have any suggestions on how to speed up training and improve the agent's performance?


1 Answer

Stack Overflow user

Answered on 2019-11-29 06:57:33

Code language: python
def decay_epsilon(self, episode):
    self.current_eps = self.eps_end + (self.eps_start - self.eps_end) * np.exp(-self.eps_decay * episode)



# part of the code from train()
epsilon = self.current_eps
if epsilon > random.random():
    action = np.random.choice(env.action_space) #explore
else:
    values = self.policy_model.predict(state) #exploit
    action = np.argmax(values)

The problem

As training goes from episode 1 to episode 1000, the probability of taking a random action only decays from 100% to about 37%. At episode 500, there is still about a 61% chance of taking a random action.
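
To put numbers on this, here is a minimal sketch that simply evaluates the question's decay formula with its posted hyperparameters (eps_start = 1, eps_end = 0.01, eps_decay = 0.001):

Code language: python

import numpy as np

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.001

# epsilon after a given number of episodes, per the decay_epsilon() formula above
for episode in [1, 100, 500, 1000, 3000]:
    eps = eps_end + (eps_start - eps_end) * np.exp(-eps_decay * episode)
    print(f"episode {episode}: epsilon = {eps:.3f}")

# episode 500:  epsilon ≈ 0.610 -> ~61% random actions
# episode 1000: epsilon ≈ 0.374 -> ~37% random actions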

My solutions

  1. Wait it out. After roughly 3000 episodes, the chance of a random action falls to about 5%.
  2. Set eps_decay = 0.006. Then by episode 500, the chance of a random action has already dropped to about 5% (see the sketch below).
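
For reference, the schedule can also be inverted to show how many episodes each setting needs before the random-action rate reaches a given level; a minimal sketch, assuming the same exponential formula as decay_epsilon():

Code language: python

import numpy as np

def episodes_until(target_eps, eps_decay, eps_start=1.0, eps_end=0.01):
    # solve target = eps_end + (eps_start - eps_end) * exp(-eps_decay * n) for n
    return int(np.ceil(-np.log((target_eps - eps_end) / (eps_start - eps_end)) / eps_decay))

print(episodes_until(0.06, 0.001))  # ≈ 2986 episodes with the original decay rate
print(episodes_until(0.06, 0.006))  # ≈ 498 episodes with the suggested decay rate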
2 votes
Original content provided by Stack Overflow.
Original link:

https://stackoverflow.com/questions/59097989
