Problem Description
I am trying to get Breakout working with an implementation based on a Double Deep Q-Network. My problem is that even after 6000 episodes I see no improvement at all. At first glance I also cannot find anything wrong with the hyperparameter settings:
gamma = 0.99
learning_rate = 0.0001
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.995
episodes = 30000
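For scale, a multiplicative decay of 0.995 takes epsilon from 1.0 down to the 0.1 floor in roughly 460 decay steps, and in the agent below the decay runs once per learn() call. A quick check of that arithmetic:

import math

# steps until 1.0 * 0.995**n first drops below the 0.1 floor
steps_to_min = math.log(0.1) / math.log(0.995)
print(round(steps_to_min))  # -> 459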
Here is the code for my agent:
import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import RMSprop

class Agent():
    def __init__(self, state_size, action_size, gamma=0.99, epsilon=1.0,
                 epsilon_min=0.1, epsilon_decay=0.995, update_rate=10000,
                 input_dims=(88, 80, 1)):
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.update_rate = update_rate
        self.batch_size = 32
        self.state_size = state_size
        self.action_size = action_size
        self.learn_step_counter = 0
        # replay memory; a sketch of ReplayBuffer is given at the end of the post
        self.memory = ReplayBuffer(100000, input_dims)
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.model.summary()
    def build_model(self):
        x = Sequential()
        # conv feature extractor: (88, 80, 1) -> (21, 19, 32) -> (9, 8, 64) -> (7, 6, 128)
        x.add(Conv2D(32, kernel_size=(8, 8), strides=4, activation='relu',
                     input_shape=self.state_size))
        x.add(Conv2D(64, kernel_size=(4, 4), strides=2, activation='relu'))
        x.add(Conv2D(128, (3, 3), strides=1, activation='relu'))
        x.add(Flatten())
        # fully connected layers, one linear Q-value output per action
        x.add(Dense(128, activation='relu'))
        x.add(Dense(128, activation='relu'))
        x.add(Dense(64, activation='relu'))
        x.add(Dense(self.action_size, activation='linear'))
        x.compile(loss='mse',
                  optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=None, decay=0.0))
        return x
    def remember(self, state, action, reward, next_state, done):
        # store one transition in replay memory (store_transition is assumed
        # from the ReplayBuffer interface that learn() uses below)
        self.memory.store_transition(state, action, reward, next_state, done)
    def epsilon_greedy(self, state):
        # explore with probability epsilon, otherwise act greedily
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        # add a batch dimension before predicting
        act_values = self.model.predict(state[np.newaxis, ...])
        return np.argmax(act_values[0])
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        # periodically copy the online weights into the target network
        if self.learn_step_counter % self.update_rate == 0:
            self.target_model.set_weights(self.model.get_weights())
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)
        q_pred = self.model(states).numpy()
        q_next = tf.math.reduce_max(self.target_model(states_), axis=1).numpy()
        q_target = np.copy(q_pred)
        # improve on my solution!
        for idx, terminal in enumerate(dones):
            if terminal:
                q_next[idx] = 0.0
            q_target[idx, actions[idx]] = rewards[idx] + self.gamma * q_next[idx]
        self.model.train_on_batch(states, q_target)
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        self.learn_step_counter += 1
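Note that learn() above bootstraps with the plain DQN target, a max over the target network. A double DQN target, which the question refers to, instead selects the next action with the online network and evaluates it with the target network. A minimal sketch of that target computation, assuming the same networks and batch layout as above:

import numpy as np

def ddqn_targets(model, target_model, gamma, states, actions, rewards, states_, dones):
    # online network predicts current Q-values and selects the next action;
    # the target network only evaluates the selected action
    q_pred = model(states).numpy()
    next_actions = np.argmax(model(states_).numpy(), axis=1)
    q_next = target_model(states_).numpy()
    q_target = np.copy(q_pred)
    for idx in range(len(dones)):
        bootstrap = 0.0 if dones[idx] else q_next[idx, next_actions[idx]]
        q_target[idx, actions[idx]] = rewards[idx] + gamma * bootstrap
    return q_target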
#env = gym.make('MsPacman-v0')
env = gym.make('Breakout-v0')
env.reset()
state_size = (88, 80, 1)   # must match the network's input_shape
action_size = env.action_space.n
episodes = 30000
batch_size = 32
skip_start = 90
total_time = 0
all_reward = 0
blend = 4   # number of images to blend
gamma = 0.99
# pass action_size explicitly so gamma is not consumed as action_size
agent = Agent(state_size, action_size, gamma=gamma, update_rate=50)
l_episodes = []
scores = []
done = False
score = 0
lives = 5
for e in range(episodes):
    print('Episode:', e)
    done = False
    total_reward = 0
    game_score = 0
    state = process_frame(env.reset())
    images = deque(maxlen=blend)
    images.append(state)
    lives = 5   # reset the life counter so dead-detection works each episode
    while not done:
        dead = False
        while not dead:
            env.render()
            action = agent.epsilon_greedy(state)
            # env.step returns four values: observation, reward, done, info
            next_state, reward, done, info = env.step(action)
            next_state = process_frame(next_state)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            game_score += reward
            agent.learn()
            dead = info['ale.lives'] < lives
            lives = info['ale.lives']
            # when a life is lost, apply a penalty of -100
            fin_reward = game_score if not dead else -100
            if lives == 1:
                done = True
                dead = True
    if done:
        print('Episode: {}, game_score: {}, fin_reward: {}'.format(e, game_score, fin_reward))
        scores.append(fin_reward)
        l_episodes.append(e)
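Neither ReplayBuffer nor process_frame is defined in the post. The sketches below are inferred from how the agent uses them (mem_cntr, store_transition, sample_buffer, and frames shaped (88, 80, 1)); the original implementations may well differ:

import numpy as np

class ReplayBuffer():
    # flat numpy-array buffer; the attribute and method names match those
    # the Agent calls (mem_cntr, store_transition, sample_buffer)
    def __init__(self, max_size, input_dims):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int32)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        idx = self.mem_cntr % self.mem_size
        self.state_memory[idx] = state
        self.new_state_memory[idx] = state_
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])

def process_frame(frame):
    # crop and downsample a 210x160x3 Atari frame to grayscale (88, 80, 1)
    img = frame[1:176:2, ::2]                  # -> (88, 80, 3)
    img = img.mean(axis=2, keepdims=True)      # grayscale -> (88, 80, 1)
    return (img / 255.0).astype(np.float32)    # scale to [0, 1]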
Does anyone see what I am doing wrong?
Best regards