AttributeError: 'Environment1' object has no attribute 'observation_space'

Problem description

I am building a DDPG model with Keras, following the official instructions here: enter link description here

But I replaced the Gym environment with my own environment:

import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt


import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))
import time
import copy



data = pd.read_csv (r'C:\Users\willi\Downloads\spyv.csv')

data = data.loc[:,~data.columns.str.contains('^Unnamed')]


date_split = 377

train = data[:date_split]
test = data[date_split:]


class Environment1:

    def __init__(self,data,history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset ()

    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range (self.history_t)]
        return [self.position_value] + self.history  # obs

    def step(self,act):
        reward = 0

        # act = 0: stay,1: buy,2: sell
        if act == 1:
            self.positions.append (self.data.iloc[self.t,:]['close'])
        elif act == 2:  # sell
            if len (self.positions) == 0:
                reward = -1
            else:
                profits = 0
                for p in self.positions:
                    profits += (self.data.iloc[self.t,:]['close'] - p)
                reward += profits
                self.profits += profits
                self.positions = []

        # set next time
        self.t += 1
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t,:]['close'] - p)
        self.history.pop (0)
        self.history.append (self.data.iloc[self.t,:]['close'] - self.data.iloc[(self.t - 1),:]['close'])

        # clipping reward
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1

        return [self.position_value] + self.history, reward, self.done  # obs, reward, done


env = Environment1 (train)
print (env.reset ())
for _ in range (3):
    pact = np.random.randint (3)
    print (env.step (pact))
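
# Note: env.reset() returns a plain Python list of length history_t + 1 (91 here),
# and env.step() returns a 3-tuple (obs, reward, done) with no info dict.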

# Everything above is my own code; everything below is copied from the Keras DDPG tutorial.
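# NOTE: the AttributeError reported below is raised a couple of lines down, at
# env.observation_space: Environment1 is a plain Python class and, unlike a
# gym.Env, has no observation_space or action_space attributes.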

num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))

upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))


class OUActionNoise:
    def __init__(self,mean,std_deviation,theta=0.15,dt=1e-2,x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)
            
            
class Buffer:
    def __init__(self,buffer_capacity=100000,batch_size=64):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # This tells us the number of times record() was called.
        self.buffer_counter = 0

        # Instead of a list of tuples, as in the classic experience-replay setup,
        # we use a separate np.array for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity,num_states))
        self.action_buffer = np.zeros((self.buffer_capacity,num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity,1))
        self.next_state_buffer = np.zeros((self.buffer_capacity,num_states))

    # Takes an (s, a, r, s') observation tuple as input
    def record(self,obs_tuple):
        # Set index to zero if buffer_capacity is exceeded, replacing old records
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    # Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows
    # TensorFlow to build a static graph out of the logic and computations in our function.
    # This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one.
    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        # Training and updating Actor & Critic networks.
        # See Pseudo Code.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch,training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch,target_actions],training=True
            )
            critic_value = critic_model([state_batch,action_batch],training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss,critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad,critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch,training=True)
            critic_value = critic_model([state_batch,actions],training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss,actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad,actor_model.trainable_variables)
        )

    # We compute the loss and update parameters
    def learn(self):
        # Get sampling range
        record_range = min(self.buffer_counter,self.buffer_capacity)
        # Randomly sample indices
        batch_indices = np.random.choice(record_range,self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch,dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)


# This updates the target parameters slowly,
# based on the rate `tau`, which is much less than one.
@tf.function
def update_target(target_weights,weights,tau):
    for (a,b) in zip(target_weights,weights):
        a.assign(b * tau + a * (1 - tau))
        
        
def get_actor():
    # Initialize weights between -3e-3 and 3e-3
    last_init = tf.random_uniform_initializer(minval=-0.003,maxval=0.003)

    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256,activation="relu")(inputs)
    out = layers.Dense(256,activation="relu")(out)
    outputs = layers.Dense(1,activation="tanh",kernel_initializer=last_init)(out)

    # Our upper bound is 2.0 for Pendulum.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs,outputs)
    return model


def get_critic():
    # State as input
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16,activation="relu")(state_input)
    state_out = layers.Dense(32,activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions))
    action_out = layers.Dense(32,activation="relu")(action_input)

    # Both are passed through separate layers before concatenating
    concat = layers.Concatenate()([state_out,action_out])

    out = layers.Dense(256,activation="relu")(concat)
    out = layers.Dense(256,activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs a single value for a given state-action pair
    model = tf.keras.Model([state_input,action_input],outputs)

    return model


def policy(state,noise_object):
    sampled_actions = tf.squeeze(actor_model(state))
    noise = noise_object()
    # Adding noise to action
    sampled_actions = sampled_actions.numpy() + noise

    # We make sure action is within bounds
    legal_action = np.clip(sampled_actions,lower_bound,upper_bound)

    return [np.squeeze(legal_action)]

std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1),std_deviation=float(std_dev) * np.ones(1))

actor_model = get_actor()
critic_model = get_critic()

target_actor = get_actor()
target_critic = get_critic()

# Making the weights equal initially
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# Learning rate for actor-critic models
critic_lr = 0.002
actor_lr = 0.001

critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 100
# discount factor for future rewards
gamma = 0.99
# Used to update target networks
tau = 0.005

buffer = Buffer(50000,64)


# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# Takes about 4 min to train
for ep in range(total_episodes):

    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Uncomment this to see the Actor in action
        # But not in a python notebook.
        # env.render()

        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state),0)

        action = policy(tf_prev_state,ou_noise)
        # Receive state and reward from environment.
        state, reward, done, info = env.step(action)

        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward

        buffer.learn()
        update_target(target_actor.variables,actor_model.variables,tau)
        update_target(target_critic.variables,critic_model.variables,tau)

        # End this episode when `done` is True
        if done:
            break

        prev_state = state

    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(ep,avg_reward))
    avg_reward_list.append(avg_reward)

# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()


# Save the weights
actor_model.save_weights("pendulum_actor.h5")
critic_model.save_weights("pendulum_critic.h5")

target_actor.save_weights("pendulum_target_actor.h5")
target_critic.save_weights("pendulum_target_critic.h5")

Then I get the error:

AttributeError: 'Environment1' object has no attribute 'observation_space'

Can anyone help?

Solution

No confirmed solution to this problem has been found yet.
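
One possible direction (untested, and only a sketch, assuming the gym package used by the Keras tutorial is installed): the tutorial code touches the environment only through observation_space, action_space, reset() and step(), so Environment1 could expose Gym-style spaces itself. The action bounds below are placeholder assumptions, and since DDPG produces continuous actions, step() would also have to map the continuous action back onto the discrete stay/buy/sell logic:

import numpy as np
from gym import spaces


class Environment1:

    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        # Observation is [position_value] + history, i.e. history_t + 1 floats.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(history_t + 1,), dtype=np.float32
        )
        # Placeholder: a 1-D continuous action in [-1, 1]; step() would still
        # have to translate this value into the stay/buy/sell decision.
        self.action_space = spaces.Box(
            low=-1.0, high=1.0, shape=(1,), dtype=np.float32
        )
        self.reset()

    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.history_t)]
        return [self.position_value] + self.history  # obs

    # step() stays exactly as in the question.

Alternatively, the four lines in the Keras code that read env.observation_space and env.action_space could simply be replaced with hand-set constants (num_states = 91, num_actions = 1, plus explicit upper and lower action bounds).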
