问题描述
刚刚开始了解Policy Gradient,并且错误不断出现。它说要输入该范围内的Y值,但是该算法表示将Y值作为折价奖励,有时会高于该范围。错误是:
Traceback (most recent call last):
File "policy_cartpole.py",line 69,in <module>
pg.train(10000)
File "policy_cartpole.py",line 61,in train
loss = self.update_network(rewards,states,actions)
File "policy_cartpole.py",line 43,in update_network
loss = self.network.fit(states,discounted_rewards)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\keras\engine\training.py",line 108,in _method_wrapper
return method(self,*args,**kwargs)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\keras\engine\training.py",line 1098,in fit
tmp_logs = train_function(iterator)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\def_function.py",line 780,in __call__
result = self._call(*args,**kwds)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\def_function.py",line 840,in _call
return self._stateless_fn(*args,**kwds)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\function.py",line 2829,in __call__
return graph_function._filtered_call(args,kwargs) # pylint: disable=protected-access
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\function.py",line 1848,in _filtered_call
cancellation_manager=cancellation_manager)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\function.py",line 1924,in _call_flat
ctx,args,cancellation_manager=cancellation_manager))
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\function.py",line 550,in call
ctx=ctx)
File "E:\projects\RL\Policy Gradient\venv\lib\site-packages\tensorflow\python\eager\execute.py",line 60,in quick_execute
inputs,attrs,num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Received a label value of 2 which is outside the valid range of [0,2). Label values: 0 2 0 0 0 0 0 0 0
[[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at policy_cartpole.py:43) ]] [Op:__inference_train_function_1167]
Function call stack:
train_function
我找不到其他方法来获得打折奖励,以使其落入所需的相同范围内。 代码-
import gym
import numpy as np
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow.keras import layers
env = gym.make('CartPole-v0')
GAMMA = 0.95
class policy_gradient:
def __init__(self):
self.num_actions = env.action_space.n
# self.actions = [i for i in range(self.num_actions)]
self.network = self.build_network()
def get_action(self,state):
probabs_action = self.network.predict(state.reshape(1,-1))
selected_action = np.random.choice(self.num_actions,p=probabs_action[0])
return selected_action
def build_network(self):
model = Sequential(
[
layers.Dense(64,activation='relu'),layers.Dense(64,layers.Dense(self.num_actions,activation='softmax'),]
)
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam')
return model
def update_network(self,rewards,actions):
tot_reward = 0
discounted_rewards = []
for reward in rewards[::-1]:
tot_reward += reward + GAMMA * tot_reward
discounted_rewards.append(tot_reward)
discounted_rewards.reverse()
discounted_rewards = np.array(discounted_rewards)
discounted_rewards = (discounted_rewards - np.mean(discounted_rewards))/np.std(discounted_rewards)
states = np.vstack(states)
loss = self.network.fit(states,discounted_rewards)
return loss
def train(self,num_epochs):
for i in range(num_epochs):
state = env.reset()
rewards = []
states = []
actions = []
while True:
action = self.get_action(state)
new_state,reward,done,_ = env.step(action)
states.append(state)
rewards.append(reward)
actions.append(action)
if done:
loss = self.update_network(rewards,actions)
tot_reward = sum(rewards)
print(f'reward for episode {i+1} is {tot_reward}')
break
state = new_state
pg = policy_gradient()
pg.train(10000)
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)