Memory seemingly needs to be flushed before training can work again?

Problem description

I have been running the trpo_mpi algorithm on OpenAI's Ant-v2 under the baselines tf2 branch, and most of the time I get decent results. However, it seems that whenever I make certain modifications to the model being trained, or manipulate weights in trpo_mpi.py, a problem occurs: training stalls no matter how many iterations it loops through. On top of that, setups that normally train fine also stop working. If I then run a completely different default setup, which seems to trigger something like a memory flush, and afterwards re-run the first setup with modifications similar to before, everything starts working again.

Here is the default model in baselines-tf2's common/models.py (plus a few minor modifications that I believe have no effect):

    # (This sits inside the enclosing network factory, which supplies
    # num_layers, num_hidden, activation, and ortho_init.)
    def network_fn(input_shape):
        # input_shape = (27,) for Ant-v2
        print('input shape is {}'.format(input_shape))
        x_input = tf.keras.Input(shape=input_shape)
        # h = tf.keras.layers.Flatten(x_input)
        h = x_input
        for i in range(num_layers):
            h = tf.keras.layers.Dense(units=num_hidden,
                                      kernel_initializer=ortho_init(np.sqrt(2)),
                                      name='mlp_fc{}'.format(i),
                                      activation=activation)(h)

        network = tf.keras.Model(inputs=[x_input], outputs=[h])
        return network

    shape = (27,)
    network = network_fn(shape)  # for Ant-v2
    return network_fn, network
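For reference, here is a minimal self-contained sketch of the same pattern (my own reconstruction: the factory arguments num_layers=2, num_hidden=64, activation=tf.tanh are assumed defaults, and ortho_init is omitted for brevity), which can be run standalone to confirm the latent output shape for Ant-v2's 27-dimensional observations:

import numpy as np
import tensorflow as tf

def make_network_fn(num_layers=2, num_hidden=64, activation=tf.tanh):
    # Standalone stand-in for the enclosing factory; ortho_init omitted.
    def network_fn(input_shape):
        x_input = tf.keras.Input(shape=input_shape)
        h = x_input
        for i in range(num_layers):
            h = tf.keras.layers.Dense(units=num_hidden, activation=activation,
                                      name='mlp_fc{}'.format(i))(h)
        return tf.keras.Model(inputs=[x_input], outputs=[h])
    return network_fn

network = make_network_fn()((27,))  # (27,) is Ant-v2's observation shape
print(network(np.zeros((1, 27), np.float32)).shape)  # expect (1, 64)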

And here is the updated model I started using:

    def network_fn(input_shape):  # Fractal TEST 2.0
        # input_shape = (27,) for Ant-v2
        print('input shape is {}'.format(input_shape))
        x_input = tf.keras.Input(shape=input_shape)
        # h = tf.keras.layers.Flatten(x_input)

        # Top branch: two Dense layers fed by the raw input.
        h1 = x_input
        h1 = tf.keras.layers.Dense(units=64, name='dense_top_0', activation=activation)(h1)
        h1 = tf.keras.layers.Dense(units=63, name='dense_top_1', activation=activation)(h1)

        # Bottom branch: an identical stack fed by the same input.
        h2 = x_input
        h2 = tf.keras.layers.Dense(units=64, name='dense_bot_0', activation=activation)(h2)
        h2 = tf.keras.layers.Dense(units=63, name='dense_bot_1', activation=activation)(h2)

        # Concatenate the branches, then project down to 8 latent units.
        h3 = tf.keras.layers.concatenate([h1, h2])
        h4 = tf.keras.layers.Dense(units=8, name='dense_out', activation=activation)(h3)

        # network = tf.keras.Model(inputs=[x_input], outputs=[h3])
        network = tf.keras.Model(inputs=[x_input], outputs=[h4])
        return network

    shape = (27,)
    network = network_fn(shape)  # for Ant-v2
    # print(network.summary())
    # pol_weights_bias_0 = network.get_layer('dense_0').get_weights()
    # print('weights_bias')
    # print(pol_weights_bias_0)
    # input('')
    return network_fn, network
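To double-check the branch layout, here is a compact standalone rebuild (my own sketch, with the activation assumed to be tanh) that also verifies the layer names relied on by kill_actions and hold_weights below resolve via get_layer():

import numpy as np
import tensorflow as tf

def build_fractal(input_shape=(27,), activation=tf.tanh):
    # Compact rebuild of the two-branch model above.
    x = tf.keras.Input(shape=input_shape)
    h1 = tf.keras.layers.Dense(64, activation=activation, name='dense_top_0')(x)
    h1 = tf.keras.layers.Dense(63, activation=activation, name='dense_top_1')(h1)
    h2 = tf.keras.layers.Dense(64, activation=activation, name='dense_bot_0')(x)
    h2 = tf.keras.layers.Dense(63, activation=activation, name='dense_bot_1')(h2)
    h3 = tf.keras.layers.concatenate([h1, h2])  # 63 + 63 = 126 units
    h4 = tf.keras.layers.Dense(8, activation=activation, name='dense_out')(h3)
    return tf.keras.Model(inputs=[x], outputs=[h4])

net = build_fractal()
for name in ('dense_top_0', 'dense_top_1', 'dense_bot_0', 'dense_bot_1', 'dense_out'):
    w, b = net.get_layer(name).get_weights()
    print(name, w.shape, b.shape)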

Here is a function that updates the weights, which I created and use in policy.py; it gets called from trpo_mpi.py:

    def kill_actions(self):
        # Successfully kills self.mean in DiagGaussianPd(Pd).sample() in distributions.py
        # Fractal test 2.0 (it's really just a duplicated NN)

        # Lock the bottom branch's actions by zeroing its kernels and biases
        # in both the policy and the value network.
        for layer_name in ('dense_bot_0', 'dense_bot_1'):
            for net in (self.policy_network, self.value_network):
                layer = net.get_layer(layer_name)
                kernel, bias = layer.get_weights()
                layer.set_weights([kernel * 0, bias * 0])
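One thing worth noting (my own observation, not from baselines): set_weights() only overwrites the current values, and the underlying variables stay trainable, so the very next TRPO update can move them away from zero unless kill_actions is called again every iteration. If the goal is to keep a branch frozen for a whole phase, the standard Keras mechanism is to mark the layers non-trainable, though whether trpo_mpi honors this depends on how it collects its variable list:

import tensorflow as tf

# Hypothetical sketch: freeze the bottom branch by flagging its layers
# as non-trainable instead of repeatedly zeroing their weights.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, name='dense_bot_0', input_shape=(27,)),
    tf.keras.layers.Dense(63, name='dense_bot_1'),
])
for name in ('dense_bot_0', 'dense_bot_1'):
    model.get_layer(name).trainable = False
print(len(model.trainable_variables))  # 0 -- both layers are now excluded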

And here is another function that manipulates the weights:

    def hold_weights(self):
        # Fractal test 1.0
        # Hold the top weights at the values saved after phase 1
        pol_w_b_top0 = np.load('pol_weights_bias_top_0.npy', allow_pickle=True)
        val_w_b_top0 = np.load('val_weights_bias_top_0.npy', allow_pickle=True)
        pol_w_b_top1 = np.load('pol_weights_bias_top_1.npy', allow_pickle=True)
        val_w_b_top1 = np.load('val_weights_bias_top_1.npy', allow_pickle=True)

        self.policy_network.get_layer('dense_top_0').set_weights(pol_w_b_top0)
        self.value_network.get_layer('dense_top_0').set_weights(val_w_b_top0)
        self.policy_network.get_layer('dense_top_1').set_weights(pol_w_b_top1)
        self.value_network.get_layer('dense_top_1').set_weights(val_w_b_top1)

        # Fractal test 2.0
        # Scale the bottom weights to 0.5 of what they were about to be set to
        for layer_name in ('dense_bot_0', 'dense_bot_1'):
            for net in (self.policy_network, self.value_network):
                layer = net.get_layer(layer_name)
                kernel, bias = layer.get_weights()
                layer.set_weights([kernel * 0.5, bias * 0.5])
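As an aside, the allow_pickle=True in the np.load calls above is needed because get_weights() returns a [kernel, bias] list whose arrays have different shapes; np.save can only store that as an object array, which np.load refuses to load by default. A tiny round-trip sketch (the file name is hypothetical):

import numpy as np

# [kernel, bias] have different shapes, so force an object array explicitly
# (newer NumPy versions reject ragged lists otherwise).
weights = [np.ones((27, 64), np.float32), np.zeros((64,), np.float32)]
np.save('demo_weights.npy', np.asarray(weights, dtype=object))
kernel, bias = np.load('demo_weights.npy', allow_pickle=True)
print(kernel.shape, bias.shape)  # (27, 64) (64,)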

Finally, here is the code I have been using after the program exits the learn() function in trpo_mpi.py, in order to save the weights:

    # After phase 1: save the top-branch weights.
    pol_weights_bias_top_0 = pi_policy_network.get_layer('dense_top_0').get_weights()
    val_weights_bias_top_0 = pi_value_network.get_layer('dense_top_0').get_weights()
    pol_weights_bias_top_1 = pi_policy_network.get_layer('dense_top_1').get_weights()
    val_weights_bias_top_1 = pi_value_network.get_layer('dense_top_1').get_weights()

    np.save('pol_weights_bias_top_0.npy', pol_weights_bias_top_0)
    np.save('val_weights_bias_top_0.npy', val_weights_bias_top_0)
    np.save('pol_weights_bias_top_1.npy', pol_weights_bias_top_1)
    np.save('val_weights_bias_top_1.npy', val_weights_bias_top_1)

    # After phase 2: save both branches again under '_p2' names.
    pol_weights_bias_top_0 = pi_policy_network.get_layer('dense_top_0').get_weights()
    val_weights_bias_top_0 = pi_value_network.get_layer('dense_top_0').get_weights()
    pol_weights_bias_top_1 = pi_policy_network.get_layer('dense_top_1').get_weights()
    val_weights_bias_top_1 = pi_value_network.get_layer('dense_top_1').get_weights()

    np.save('pol_weights_bias_top_0_p2.npy', pol_weights_bias_top_0)
    np.save('val_weights_bias_top_0_p2.npy', val_weights_bias_top_0)
    np.save('pol_weights_bias_top_1_p2.npy', pol_weights_bias_top_1)
    np.save('val_weights_bias_top_1_p2.npy', val_weights_bias_top_1)

    pol_weights_bias_bot_0 = pi_policy_network.get_layer('dense_bot_0').get_weights()
    val_weights_bias_bot_0 = pi_value_network.get_layer('dense_bot_0').get_weights()
    pol_weights_bias_bot_1 = pi_policy_network.get_layer('dense_bot_1').get_weights()
    val_weights_bias_bot_1 = pi_value_network.get_layer('dense_bot_1').get_weights()

    np.save('pol_weights_bias_bot_0_p2.npy', pol_weights_bias_bot_0)
    np.save('val_weights_bias_bot_0_p2.npy', val_weights_bias_bot_0)
    np.save('pol_weights_bias_bot_1_p2.npy', pol_weights_bias_bot_1)
    np.save('val_weights_bias_bot_1_p2.npy', val_weights_bias_bot_1)
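If it helps with debugging, a quick hypothetical check that the "hold" actually worked: the top-layer weights saved after phase 2 should match the phase-1 values they were pinned to:

import numpy as np

# Compare a top layer's phase-1 weights with its phase-2 snapshot.
w1, b1 = np.load('pol_weights_bias_top_0.npy', allow_pickle=True)
w2, b2 = np.load('pol_weights_bias_top_0_p2.npy', allow_pickle=True)
print(np.allclose(w1, w2), np.allclose(b1, b2))  # expect True True if held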

Sorry for all the code, but I am trying to provide as much information as possible.

Again: is there any reason why a setup that normally trains would break until a separate setup is run, and only then start working again?

*** UPDATE: I can run the unaltered setup with the altered code present, and the altered code that does not work was also preventing the unaltered code from running at the same time!
