Problem description
This is my code for a simple DQN that learns to play Snake, and I don't know why it stops learning after a while. It learns that the snake head shouldn't crash into the walls, but it never learns to eat the fruit, even though I give a reward for getting closer to the fruit and an even bigger negative reward for moving away from it (this is meant to make the snake understand that it should aim for the fruit). But for some reason the score never goes above 1 or 2:
###############################################
# MAIN.py
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 10 13:04:45 2020
@author: Ryan
"""
from dq_learning import Agent
import numpy as np
import tensorflow as tf
import snake
import sys
import pygame
import gym

if __name__ == '__main__':
    observation_space = 31
    action_space = 4
    lr = 0.001
    n_games = 50000
    steps = 1000
    #env = gym.make("LunarLander-v2")
    #observation_space = env.observation_space.shape
    #action_space = env.action_space.n
    agent = Agent(gamma=0.99, epsilon=1.0, lr=lr, input_dims=observation_space,
                  n_actions=action_space, batch_size=64)
    scores = []
    eps_history = []
    done = False
    for i in range(n_games):
        score = 0
        #first observation
        observation = [0 for i in range(observation_space)]
        #observation = env.reset()
        for j in range(steps):
            #env.render()
            for evt in pygame.event.get():
                if evt.type == pygame.QUIT:
                    pygame.quit()
                    sys.exit()
            #actions go from 0 to n_actions - based on the model prediction or random choice
            #action space is the list of all the possible actions
            action = agent.choose_action(observation)
            #print("action: ", action)
            #env.step(action) returns -> new observation, reward, done, info
            observation_, reward, done, info = snake.step(action, 25)
            #observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()
            if done:
                break
        print("NEXT GAME")
        done = False
        eps_history.append(agent.epsilon)
        scores.append(score)
        avg_score = np.mean(scores[-100:])
        print("episode: ", i, " score %.2f" % score, "average score: %.2f" % avg_score, " epsilon %.2f" % agent.epsilon)
        print("last score: ", scores[-1])
#####################################
#DQ_LEARNING.PY
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 4 12:23:14 2020
@author: Ryan
"""
import numpy as np
import tensorflow as tf
from tensorflow import keras

class ReplayBuffer:
    def __init__(self, max_size, input_dims):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)  #done flags

    def store_transitions(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1 - int(done)
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        return states, actions, rewards, states_, terminal

def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    model = keras.Sequential()
    model.add(keras.layers.Dense(fc1_dims, activation='relu'))
    model.add(keras.layers.Dense(fc2_dims, activation='relu'))
    model.add(keras.layers.Dense(n_actions))
    opt = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss='mean_squared_error')
    return model

class Agent():
    def __init__(self, lr, gamma, n_actions, epsilon, batch_size, input_dims,
                 epsilon_dec=1e-3, epsilon_end=1e-2, mem_size=1e6, fname='dqn_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = epsilon_end
        self.eps_dec = epsilon_dec
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(int(mem_size), input_dims)
        self.q_eval = build_dqn(lr, n_actions, input_dims, 256, 256)

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transitions(state, action, reward, new_state, done)

    def choose_action(self, observation):
        #epsilon-greedy: random action with probability epsilon, else greedy
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)
        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        #Bellman target: r + gamma * max_a' Q(s', a'), zeroed on terminal states
        q_target[batch_index, actions] = rewards + \
            self.gamma * np.max(q_next, axis=1) * dones
        self.q_eval.train_on_batch(states, q_target)
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
            self.eps_min else self.eps_min

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = keras.models.load_model(self.model_file)
##########################################
# snake.py
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 4 14:32:30 2020
@author: Ryan
"""
import pygame
import random
from math import sqrt
import time

class Snakehead:
    def __init__(self, posx, posy, width, height):
        self.posx = posx
        self.posy = posy
        self.width = width
        self.height = height
        self.movement = 'null'
        self.speed = 16
        self.gameover = False

    def draw(self, display):
        #                  color (RGB)   coordinates / dimensions
        pygame.draw.rect(display, [0, 0, 0], [self.posx, self.posy, self.width, self.height])

    def read_input(self, key):
        if key == 0:
            self.movement = 'left'
        elif key == 1:
            self.movement = 'right'
        elif key == 2:
            self.movement = 'up'
        elif key == 3:
            self.movement = 'down'
        print(self.movement)

    def get_pos(self):
        return self.posx, self.posy

    def get_movement(self):
        return self.movement

    def restart(self, ScreenW, ScreenH):
        self.posx = ScreenW / 2 - 16/2
        self.posy = ScreenH / 2 - 16/2

    def move(self, SW, SH):
        if self.movement == 'right':
            self.posx += self.speed
        elif self.movement == 'left':
            self.posx -= self.speed
        elif self.movement == 'up':
            self.posy -= self.speed
        elif self.movement == 'down':
            self.posy += self.speed

class Food:
    def __init__(self, posx, posy, width, height):
        self.posx = posx
        self.posy = posy
        self.width = width
        self.height = height
        self.red = random.randint(155, 255)

    def draw(self, display):
        pygame.draw.rect(display, [self.red, 0, 0], [self.posx, self.posy, self.width, self.height])

    def get_pos(self):
        return self.posx, self.posy

    def respawn(self, ScreenW, ScreenH):
        self.posx = random.randint(1, (ScreenW - 16) // 16) * 16
        self.posy = random.randint(1, (ScreenH - 16) // 16) * 16
        self.red = random.randint(155, 255)

class Tail:
    def __init__(self, posx, posy, width, height):
        self.width = width
        self.height = height
        self.posx = posx
        self.posy = posy
        self.RGB = [random.randint(0, 255) for i in range(3)]

    def draw(self, display):
        pygame.draw.rect(display, self.RGB, [self.posx, self.posy, 16, 16])

    def move(self, px, py):
        self.posx = px
        self.posy = py

    def get_pos(self):
        return self.posx, self.posy

ScreenW = 720
ScreenH = 720
sheadX = 0
sheadY = 0
fX = 0
fY = 0
counter = 0
pygame.init()
pygame.display.set_caption("Snake Game")
display = pygame.display.set_mode([ScreenW, ScreenH])
display.fill([255, 255, 255])  #RGB white
black = [0, 0, 0]
font = pygame.font.SysFont(None, 30)
score = font.render("score: 0", True, black)
shead = Snakehead(ScreenW / 2 - 16/2, ScreenH / 2 - 16/2, 16, 16)
f = Food(random.randint(0, (ScreenW - 16) // 16) * 16 - 8,
         random.randint(0, (ScreenH - 16) // 16) * 16, 16, 16)
tails = []
Fps = 60
timer_clock = pygame.time.Clock()
previous_distance = 0
d = 0
def step(action, observation_space):
    global score, counter, tails, shead, previous_distance, d
    shead.gameover = False
    observation_, reward, info = [0 for i in range(observation_space + 6)], 0, 0
    display.fill([255, 255, 255])
    shead.read_input(action)
    sheadX, sheadY = shead.get_pos()
    fX, fY = f.get_pos()
    #detect collision between the head and the food
    if sheadX + 16 > fX and sheadX < fX + 16:
        if sheadY + 16 > fY and sheadY < fY + 16:
            #collision: respawn the food and grow the tail
            f.respawn(ScreenW, ScreenH)
            counter += 1
            score = font.render("score: " + str(counter), True, black)
            if len(tails) == 0:
                tails.append(Tail(sheadX, sheadY, 16, 16))
            else:
                tX, tY = tails[-1].get_pos()
                tails.append(Tail(tX, tY, 16, 16))
            reward = 100
            print(tails)
    for i in range(len(tails)):
        try:
            tX, tY = tails[i].get_pos()
            sX, sY = shead.get_pos()
            if i != 0 and i != 1:
                if tX == sX and tY == sY:
                    print("collision")
                    #the head hit its own tail
                    shead.restart(ScreenW, ScreenH)
                    tails.clear()
                    counter = 0
                    display.blit(score, (10, 10))
                    pygame.display.flip()
                    pygame.display.update()
                    reward = -300
                    shead.gameover = True
                    print("lost-3")
        except:
            shead.restart(ScreenW, ScreenH)
            tails.clear()
            counter = 0
            reward = -300
            shead.gameover = True
            print("lost-0")
    sX, sY = shead.get_pos()
    if sX < 0 or sX + 16 > ScreenW:
        #the head left the board horizontally: restart
        shead.restart(ScreenW, ScreenH)
        counter = 0
        display.blit(score, (10, 10))
        pygame.display.flip()
        pygame.display.update()
        tails.clear()
        print("lost-1")
        reward = -200
        shead.gameover = True
    elif sY < 0 or sY + 16 > ScreenH:
        #the head left the board vertically: restart
        shead.restart(ScreenW, ScreenH)
        counter = 0
        display.blit(score, (10, 10))
        pygame.display.flip()
        pygame.display.update()
        tails.clear()
        reward = -200
        shead.gameover = True
        print("lost-2")
    #each tail segment follows the one in front of it
    for i in range(1, len(tails)):
        tX, tY = tails[len(tails) - i - 1].get_pos()
        tails[len(tails) - i].move(tX, tY)
    if len(tails) > 0:
        tX, tY = shead.get_pos()
        tails[0].move(tX, tY)
    shead.move(ScreenW, ScreenH)
    shead.draw(display)
    display.blit(score, (10, 10))
    for tail in tails:
        tail.draw(display)
    f.draw(display)
    pygame.display.flip()
    pygame.display.update()
    timer_clock.tick(Fps)
    #build the observation and the done flag
    done = shead.gameover
    hx, hy = shead.get_pos()
    hx /= ScreenW
    hy /= ScreenH
    fx, fy = f.get_pos()
    fx /= ScreenW
    fy /= ScreenH
    observation_[0] = abs(hx - fx)
    observation_[1] = abs(hy - fy)
    previous_distance = d
    d = sqrt((fx - hx)**2 + (fy - hy)**2)
    observation_[2] = d
    observation_[3] = 0
    observation_[4] = hx
    observation_[5] = hy
    #l x l grid of tail-occupancy flags centred on the head
    #(5x5 for observation_space=25, giving 25 + 6 = 31 features)
    c = 6
    xlist = []
    ylist = []
    for t in tails:
        tx, ty = t.get_pos()
        tx /= 16
        ty /= 16
        xlist.append(tx)
        ylist.append(ty)
    l = int(sqrt(observation_space))
    startX, startY = shead.get_pos()
    startX /= 16
    startY /= 16
    m = (l - 1) / 2
    for x in range(l):
        for y in range(l):
            found = False
            for i in range(len(xlist)):
                if int(xlist[i]) == int(startX) - m + x and int(ylist[i]) == int(startY) - m + y:
                    observation_[c] = 1
                    found = True
                    break
            if not found:
                observation_[c] = 0
            c += 1
    print("reward: ", reward)
    print("c_reward: ", counter * 10)
    d_reward = 10 if d < previous_distance else -100
    print("d_reward: ", d_reward)
    print(observation_, reward + d_reward + counter * 10, done, 0)
    return observation_, reward, done, 0
Solution
The reward function looks fine to me.
However, you say "I give a reward for getting closer to the fruit and a bigger negative reward for moving further away", yet in the code it doesn't look like you actually use d_reward; it is printed but never added to the returned reward:
print("reward: ",reward)
print("c_reward: ",counter*10)
d_reward = 10 if d < previous_distance else - 100
print("d_reward: ",d_reward)
print(observation_,reward + d_reward + counter*10,done,0)
return observation_,reward,0
And that's fine, because d_reward really isn't necessary. Just give a reward for eating an apple, a negative reward for dying, and 0 otherwise.
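A minimal sketch of that scheme (the function name and the exact values here are mine, purely illustrative, not from the question's code):
def compute_reward(ate_apple, died):
    #+1 for eating the apple, -1 for dying, 0 for every other step;
    #the relative magnitudes matter more than the exact numbers
    if ate_apple:
        return 1.0
    if died:
        return -1.0
    return 0.0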
I suspect the problem is in your state representation. Just by looking at your state, your agent has no way of knowing which direction it should go, because the information about the apple's position relative to the head is given as an absolute value.
As an example, if your board is:
[food, head, empty]
your observation would be:
[1, 1, 0]
But if your board is:
[empty, head, food]
the observation is the same:
[1, 1, 0]
This is a problem: for a given input, the same action can be either good or bad, and the agent has no way to tell the two situations apart. That makes learning impossible. In our example, for the input [1, 1, 0], the network may be pushed towards (or away from) both directions, left and right, and never converge.
That's because your training data will contain examples of this input where moving left is good, others where it is neutral, others where it is bad, and likewise examples where moving right is good, neutral, bad, and so on.
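To make the ambiguity concrete, here is a small hypothetical illustration (the variable names and values are mine, not from the question's code):
#Mirrored boards produce identical observations when only absolute
#distances are encoded, so left vs. right cannot be recovered.
head_x = 0.5
obs_food_left = abs(head_x - 0.25)    #food on the left  -> 0.25
obs_food_right = abs(head_x - 0.75)   #food on the right -> 0.25, identical
assert obs_food_left == obs_food_right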
I suggest encoding more information in your state (or observation). I would suggest something like this (I took it from one of my projects, so you will need to adapt it):
def get_state(self):
    head = self.snake[0]
    danger_top = head.y == self.board_dim.y - 1 or Point(head.x, head.y + 1) in self.snake
    danger_bot = head.y == 0 or Point(head.x, head.y - 1) in self.snake
    danger_right = head.x == self.board_dim.x - 1 or Point(head.x + 1, head.y) in self.snake
    danger_left = head.x == 0 or Point(head.x - 1, head.y) in self.snake

    apple_top = head.y < self.apple.y
    apple_bot = head.y > self.apple.y
    apple_right = head.x < self.apple.x
    apple_left = head.x > self.apple.x

    return np.array([danger_top, danger_bot, danger_right, danger_left,
                     apple_top, apple_bot, apple_right, apple_left], dtype=int)
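If you adopt an 8-feature state like this one, the observation size in your MAIN.py would shrink accordingly. A sketch, assuming the rest of your training loop stays the same:
#4 danger flags + 4 apple-direction flags
observation_space = 8
action_space = 4
agent = Agent(gamma=0.99, epsilon=1.0, lr=0.001,
              input_dims=observation_space, n_actions=action_space,
              batch_size=64)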
Please let me know if I have missed part of the code or if you have any questions. Thanks in advance.