DDQN not training on Breakout

Problem description

I am trying to get Breakout working with my implementation of a Double Deep Q-Network (DDQN). My problem is that even after 6000 episodes I see no improvement. At first glance I also cannot find anything wrong with the hyperparameter settings:

gamma = 0.99

learning_rate = 0.0001

epsilon = 1.0

epsilon_min = 0.1

epsilon_decay = 0.995

episodes = 30000
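As a quick sanity check on the schedule (assuming epsilon is multiplied by epsilon_decay once per episode, which is the usual convention for a value like 0.995), epsilon reaches epsilon_min long before episode 6000:

import math

epsilon, epsilon_min, epsilon_decay = 1.0, 0.1, 0.995
episodes_to_min = math.ceil(math.log(epsilon_min / epsilon) / math.log(epsilon_decay))
print(episodes_to_min)  # about 460 episodes of multiplicative decay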

Here is the code for my agent:

import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import RMSprop

# ReplayBuffer and process_frame are project helpers assumed to be defined elsewhere.

class Agent():
    def __init__(self,state_size,action_size,gamma = 0.99,epsilon = 1.0,epsilon_min = 0.1,epsilon_decay = 0.995,update_rate = 10000,input_dims=(88,80,1)):
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.update_rate = update_rate
        self.batch_size = 32
        self.learn_step_counter = 0  # counts learning steps, used for target-network updates
        
        self.state_size = state_size
        self.action_size = action_size
        
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())
        
        self.memory = ReplayBuffer(100000,input_dims)
        self.model.summary()
        
        
    def build_model(self):
        x = Sequential()
        x.add(Conv2D(32,kernel_size = (8,8),strides = 4,activation = 'relu',input_shape = self.state_size))
        x.add(Conv2D(64,kernel_size=(4,4),strides = 2,activation = 'relu'))
        x.add(Conv2D(128,(3,3),strides = 1,activation = 'relu'))
        x.add(Flatten())
        
        #fc layers
        x.add(Dense(128,activation = 'relu'))
        x.add(Dense(128,activation = 'relu'))
        x.add(Dense(64,activation = 'relu'))
        x.add(Dense(self.action_size,activation = 'linear'))
        
        x.compile(loss = 'mse',optimizer = RMSprop(learning_rate = 0.00025,rho = 0.95))
        
        return x
        

    def remember(self,state,action,reward,next_state,done):
        # Store the experience in the replay buffer.
        # Assumes the ReplayBuffer above exposes a store_transition method;
        # adjust this call to whatever interface your buffer actually provides.
        self.memory.store_transition(state,action,reward,next_state,done)
        
        
    def epsilon_greedy(self,state):
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        # add a batch dimension before asking the network for Q-values
        act_values = self.model.predict(state[np.newaxis,...])
        
        return np.argmax(act_values[0])

    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # sync the target network every update_rate learning steps
        if self.learn_step_counter % self.update_rate == 0:
            self.target_model.set_weights(self.model.get_weights())

        states,actions,rewards,states_,dones = \
                                    self.memory.sample_buffer(self.batch_size)

        q_pred = self.model(states).numpy()
        q_next = tf.math.reduce_max(self.target_model(states_),axis=1).numpy()
        q_target = np.copy(q_pred)

        # improve on my solution!
        for idx,terminal in enumerate(dones):
            if terminal:
                q_next[idx] = 0.0
            q_target[idx,actions[idx]] = rewards[idx] + self.gamma*q_next[idx]

        self.model.train_on_batch(states,q_target)

        self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > \
                        self.epsilon_min else self.epsilon_min

        self.learn_step_counter += 1
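
For reference, the target computed above is the plain DQN target (max over the target network). The Double DQN target I am aiming for selects the next action with the online network and evaluates it with the target network; the snippet below is only a sketch reusing the names from learn() above, not a drop-in replacement:

# Double DQN target (sketch): the online network picks the next action,
# the target network evaluates it. model/target_model, states, states_,
# actions, rewards, dones and gamma mean the same as in learn() above.
q_pred = model(states).numpy()
next_actions = np.argmax(model(states_).numpy(), axis=1)
q_next_target = target_model(states_).numpy()

q_target = np.copy(q_pred)
for idx, terminal in enumerate(dones):
    bootstrap = 0.0 if terminal else q_next_target[idx, next_actions[idx]]
    q_target[idx, actions[idx]] = rewards[idx] + gamma * bootstrap

model.train_on_batch(states, q_target)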

And here is the code for my main loop:

#env = gym.make('MsPacman-v0')
env = gym.make('Breakout-v0')
env.reset()
        
state_size = (88,80,1)  # must match the shape produced by process_frame
action_size = env.action_space.n
episodes = 30000
batch_size = 32
skip_start = 90 
total_time = 0
all_reward = 0
blend = 4 # Number of images to blend
done = False
gamma = 0.99
        
agent = Agent(state_size,action_size,update_rate = 50)
        

l_episodes = []
scores = []
done = False
score = 0
lives = 5
for e in range(episodes):
    print('Episode:',e)
    done = False
    total_reward = 0
    game_score = 0
    #state = env.reset()
    state = process_frame(env.reset())
    images = deque(maxlen = blend)
    images.append(state)
    
    while not done:
        dead = False

        while not dead:
            env.render()
            
            action = agent.epsilon_greedy(state)
            next_state,reward,done,info = env.step(action)
            next_state = process_frame(next_state)
            
            agent.remember(state,action,reward,next_state,done)
            
            state = next_state
            game_score += reward 
            
            agent.learn()
            
            
            dead = info['ale.lives']<lives
            lives = info['ale.lives']
            # when the agent loses a life, apply a penalty of -100
            fin_reward = game_score if not dead else -100
                
            print('Episode: {},game_score: {},fin reward : {}'.format(e,game_score,fin_reward))
            
            if lives == 1:
                done = True
                dead = True
        if done:
            scores.append(fin_reward)
            l_episodes.append(e)
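
One thing worth noting: the images deque is filled but never combined, so the network only ever sees the most recent frame. Something like the hypothetical blend_images helper below (assuming process_frame returns an (88,80,1) array, as in the code above) is what the blend = 4 setting is meant for:

def blend_images(images, blend=4):
    # Hypothetical helper: average the last `blend` preprocessed frames
    # into a single (88,80,1) input for the network.
    avg_image = np.zeros((88, 80, 1), dtype=np.float64)
    for image in images:
        avg_image += image
    if len(images) < blend:
        return avg_image / len(images)
    return avg_image / blend

The blended state (state = blend_images(images, blend)) would then be what epsilon_greedy sees and what gets stored in the replay buffer, instead of the single processed frame.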

Does anyone see what I am doing wrong?

Best regards

Solution

No effective solution has been posted for this question yet.
