Problem description
I have recently been trying deep Q-learning with Keras in Google Colab, and I have two problems. The first is the error "Unable to convert NumPy array to tensor (unsupported object type generator)", raised at the line next_q_value = self.target_network.predict(next_states). Please tell me how to fix this; I have been trying for hours. Second, I am not sure whether my preprocess() is implemented correctly. I am using Breakout-Deterministic, which means each action is repeated for 4 frames. Can anyone show and explain how the preprocessing should work?
The code is as follows:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random
import gym
import time
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib.animation import PillowWriter
#input_shape = (4,80,80)
class replay_buffer:
    def __init__(self,buffer_size,input_shape=(210,160,3)):
        self.buffer_size = buffer_size
        self.action_memory = np.zeros(self.buffer_size,dtype=np.int32)
        self.reward_memory = np.zeros(self.buffer_size,dtype=np.float32)
        self.state_memory = np.zeros((self.buffer_size,*input_shape),dtype=np.float32)
        self.next_state_memory = np.zeros((self.buffer_size,*input_shape),dtype=np.float32)
        self.terminal_state_memory = np.zeros(self.buffer_size,dtype=np.float32)
    def store_transition(self,index,state,next_state,action,reward,terminal_state):
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.state_memory[index] = state
        self.next_state_memory[index] = next_state
        self.terminal_state_memory[index] = terminal_state
    def sample_buffer(self):
        indexs = np.random.choice(range(len(self.terminal_state_memory)),size = self.buffer_size)
        action_sample = np.array(self.action_memory[x] for x in indexs)
        reward_sample = np.array(self.reward_memory[x] for x in indexs)
        state_sample = np.array(self.state_memory[x] for x in indexs)
        next_state_sample = np.array(self.next_state_memory[x] for x in indexs)
        terminal_state_sample = np.array(self.terminal_state_memory[x] for x in indexs)
        return action_sample,reward_sample,state_sample,next_state_sample,terminal_state_sample
class dqn_network(tf.keras.Model):
    def __init__(self):
        super(dqn_network,self).__init__()
        self.input_layer = tf.keras.Input(shape=(84,84,4,))
        self.first_hidden_layer = tf.keras.layers.Conv2D(16,8,strides=4,activation="relu")
        self.second_hidden_layer = tf.keras.layers.Conv2D(32,strides=2,activation="relu")
        self.dense_layer = tf.keras.layers.Dense(256,activation="relu")
        self.output_layer = tf.keras.layers.Dense(4,activation="linear")
    def __call__(self):
        layer1 = self.first_hidden_layer(self.input_layer)
        layer2 = self.second_hidden_layer(layer1)
        layer3 = Flatten()(layer2)
        layer4 = self.dense_layer(layer3)
        layer5 = self.output_layer(layer4)
        model = tf.keras.Model(inputs=layer1,outputs=layer5)
        model.compile(optimizer= tf.keras.optimizers.Adam(lr=1e-3),loss= tf.keras.losses.mean_squared_error)
        #return model
class agent(dqn_network):
    def __init__(self,epsilon=1,max_epsilon=1,min_epsilon=0.1,update_target=10000,timestep=0):
        super(agent,self).__init__()
        self.epsilon = epsilon
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.target_network = dqn_network()
        self.Q_network = dqn_network()
        self.update_target = update_target
        self.timestep = timestep
        self.experience_relay = replay_buffer(32)
    def update_timestep(self,newtimestep):
        self.timestep = newtimestep
    def update_target_network(self):
        if self.update_target % self.timestep == 0:
            self.target_network.set_weights(self.Q_network.get_weights())
    def greedy_policy(self):
        if random.uniform(0,1) < self.epsilon:
            return np.random.choice(env.action_space.n)
        else:
            q_values = self.target_network(state)
            return np.argmax(q_values[0])
    def store_transition(self,terminal_state):
        self.experience_relay.store_transition(index,terminal_state)
    def annealing_epsilon(self):
        interval = self.max_epsilon - self.min_epsilon
        self.epsilon -= interval / 100000
        if self.epsilon < 0.1:
            self.epsilon = self.min_epsilon
    def training(self):
        if self.timestep % 4 == 0:
            actions,rewards,states,next_states,dones = self.experience_relay.sample_buffer()
            next_q_value = self.target_network.predict(next_states)
            q_targets = rewards + (1-dones)*gamma*np.max(next_q_value,axis = 1)
            mask = tf.one_hot(actions,env.action_space.n)
            with tf.GradientTape() as tape:
                total_q_value = dqn_network(states)
                q_values = tf.reduce_sum(mask*total_q_value,axis=1,keepdims=True)
                loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(q_targets,q_values))
            grad = tape.gradient(loss,self.Q_network.trainable.variables)
            tf.keras.optimizers.Adam.apply_gradients(zip(grad,self.Q_network.trainable.variables))
class PreProcess(gym.ObservationWrapper):
    def __init__(self,env=None):
        super(PreProcess,self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0,high=255,shape=(84,1),dtype= np.uint8)
    def greyscale(self):
        return np.mean(self.observation_space,axis=2)
class model:
    def __init__(self):
        self.frame_buffer = []
    def add_img(self,img):
        self.frame_buffer.append(img)
    def create_gif(self,filepath=None): # here here here
        plt.figure(figsize=(self.frame_buffer[0].shape[1] / 72,self.frame_buffer[0].shape[0] / 72),dpi = 72)
        patch = plt.imshow(self.frame_buffer[0])
        plt.axis('off')
        def animate(i):
            patch.set_data(self.frame_buffer[i])
        ani = animation.FuncAnimation(plt.gcf(),animate,frames = len(self.frame_buffer))
        if filepath:
            writergif = animation.PillowWriter(fps=20)
            ani.save(filepath,writer = writergif)
            print("file saved")
if __name__ == "__main__":
    env = gym.make("BreakoutDeterministic-v4")
    PreProcess(env)
    dqn = agent()
    target_update = 10000
    Maxtimestep = 100000
    episode_num = 0
    frame_num = 0
    state = env.reset()
    while True:
        image_file = model()
        start = time.time()
        for timestep in range(Maxtimestep):
            frame_num +=1
            action = dqn.greedy_policy()
            dqn.annealing_epsilon()
            next_state,done,info = env.step(action)
            dqn.update_timestep(timestep)
            dqn.store_transition(timestep,done)
            img = env.render("rgb_array")
            image_file.add_img(img)
            if (target_update % frame_num) == 0:
                dqn.training()
            if done or (timestep == Maxtimestep-1):
                end = time.time()
                print("[episode: {},time taken: {:.5f} sec,timestep: {}]".format(episode_num + 1,end-start,timestep))
                if episode_num % 10 == 0:
                    image_file.create_gif(filepath= r"./drive/My Drive/GIF-breakout-v1/episode{}.gif")
                    print("[[episode: {},timestep: {}]]".format(episode_num + 1,timestep))
                break
        break
        episode_num += 1
Please help me :)
Solution
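The conversion error comes from sample_buffer: expressions like np.array(self.action_memory[x] for x in indexs) pass a generator object to np.array, which yields a 0-dimensional object array wrapping the generator. When predict(next_states) later tries to convert that object array to a tensor, TensorFlow fails with "unsupported object type generator". Wrap the comprehensions in square brackets, or better, index the memory arrays with the sampled index array directly. A minimal sketch of a corrected method (fill_count is a hypothetical counter the class would need so that sampling only draws from slots that have been written; it is not in the posted code):

def sample_buffer(self, batch_size=32):
    # fill_count (hypothetical) tracks how many transitions have been stored;
    # sampling assumes at least batch_size of them exist
    max_index = min(self.fill_count, self.buffer_size)
    indexs = np.random.choice(max_index, size=batch_size, replace=False)
    # Fancy indexing returns ordinary float32/int32 arrays that Keras can
    # convert to tensors; np.array(<generator>) cannot be converted.
    action_sample = self.action_memory[indexs]
    reward_sample = self.reward_memory[indexs]
    state_sample = self.state_memory[indexs]
    next_state_sample = self.next_state_memory[indexs]
    terminal_state_sample = self.terminal_state_memory[indexs]
    return action_sample, reward_sample, state_sample, next_state_sample, terminal_state_sample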
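Even with proper arrays, the network definition will fail next. In a subclassed tf.keras.Model the forward pass belongs in call(self, inputs), not in a no-argument __call__; tf.keras.Input is for the functional API and does not belong inside a subclassed model; Flatten is used without its tf.keras.layers prefix; and Conv2D(32,strides=2,...) is missing the required kernel_size argument. A sketch of a definition that predict can run (kernel size 4 for the second convolution is an assumption, taken from the standard DQN architecture):

class dqn_network(tf.keras.Model):
    def __init__(self, n_actions=4):
        super(dqn_network, self).__init__()
        self.first_hidden_layer = tf.keras.layers.Conv2D(16, 8, strides=4, activation="relu")
        # kernel_size=4 below is assumed; the posted code omitted this argument
        self.second_hidden_layer = tf.keras.layers.Conv2D(32, 4, strides=2, activation="relu")
        self.flatten = tf.keras.layers.Flatten()
        self.dense_layer = tf.keras.layers.Dense(256, activation="relu")
        self.output_layer = tf.keras.layers.Dense(n_actions, activation="linear")

    def call(self, inputs):
        # inputs: a batch of stacked, preprocessed frames, shape (batch, 84, 84, 4)
        x = self.first_hidden_layer(inputs)
        x = self.second_hidden_layer(x)
        x = self.flatten(x)
        x = self.dense_layer(x)
        return self.output_layer(x)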
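The training step has three related bugs: dqn_network(states) inside the GradientTape constructs a brand-new, untrained network instead of using self.Q_network; the attribute is trainable_variables, not trainable.variables; and apply_gradients must be called on an optimizer instance, not on the tf.keras.optimizers.Adam class. A sketch, assuming self.optimizer = tf.keras.optimizers.Adam(1e-3) is created in agent.__init__ and that a discount factor gamma is passed in (neither exists in the posted code):

def training(self, gamma=0.99):
    actions, rewards, states, next_states, dones = self.experience_relay.sample_buffer()
    next_q_value = self.target_network.predict(next_states)
    q_targets = rewards + (1 - dones) * gamma * np.max(next_q_value, axis=1)
    mask = tf.one_hot(actions, env.action_space.n)
    with tf.GradientTape() as tape:
        total_q_value = self.Q_network(states)  # the online network, not a fresh dqn_network
        # drop keepdims=True: a (batch,1) tensor would broadcast against the
        # (batch,) targets and silently produce a (batch,batch) error matrix
        q_values = tf.reduce_sum(mask * total_q_value, axis=1)
        loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(q_targets, q_values))
    grad = tape.gradient(loss, self.Q_network.trainable_variables)
    self.optimizer.apply_gradients(zip(grad, self.Q_network.trainable_variables))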
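On the preprocessing question: BreakoutDeterministic-v4 already applies a fixed frame-skip of 4 inside the environment, so the wrapper does not need to skip frames itself. What the classic DQN pipeline adds is (a) converting each 210x160x3 RGB frame to greyscale and resizing it to 84x84, and (b) stacking the 4 most recent processed frames into the (84,84,4) input the network expects. The posted PreProcess instead averages over self.observation_space (the Box space object, not an actual frame), declares an (84,1) shape, and is never applied because PreProcess(env) is not assigned back to env. A sketch using the observation hook of gym.ObservationWrapper and OpenCV for resizing:

import collections
import cv2
import gym
import numpy as np

class PreProcess(gym.ObservationWrapper):
    def __init__(self, env):
        super(PreProcess, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84), dtype=np.uint8)

    def observation(self, obs):
        # gym calls this on every raw (210,160,3) frame the env returns
        grey = np.mean(obs, axis=2).astype(np.uint8)              # RGB -> greyscale
        return cv2.resize(grey, (84, 84), interpolation=cv2.INTER_AREA)

class FrameStack:
    # keeps the 4 most recent processed frames as one (84,84,4) array
    def __init__(self, n_frames=4):
        self.frames = collections.deque(maxlen=n_frames)

    def reset(self, first_frame):
        for _ in range(self.frames.maxlen):
            self.frames.append(first_frame)
        return self.state()

    def append(self, frame):
        self.frames.append(frame)
        return self.state()

    def state(self):
        return np.stack(self.frames, axis=-1)

env = PreProcess(gym.make("BreakoutDeterministic-v4"))
stack = FrameStack()
state = stack.reset(env.reset())   # network input, shape (84, 84, 4)

Note also that in this version of gym, env.step(action) returns four values, (next_state, reward, done, info); the posted main loop unpacks only three and drops the reward it needs for the replay buffer.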