Problem description
I have been running the trpo_mpi algorithm from baselines-tf2 on OpenAI's Ant-v2 and getting decent results most of the time. However, it seems that whenever I make certain modifications to the model being trained, or manipulate the weights in trpo_mpi.py, something goes wrong and training stalls no matter how many iterations it loops through. On top of that, setups that normally train fine also stop working. If I then run a completely different, default setup, which seems to flush something in memory, and afterwards re-run the first setup with modifications similar to before, everything starts working again.
Here is the default model in baselines-tf2/common/models.py (plus a few small modifications that I believe have no effect):
def network_fn(input_shape):
    ##################################################################################################
    # input_shape = (27,) - for Ant-v2
    print('input shape is {}'.format(input_shape))
    x_input = tf.keras.Input(shape=input_shape)
    # h = tf.keras.layers.Flatten(x_input)
    h = x_input
    for i in range(num_layers):
        h = tf.keras.layers.Dense(units=num_hidden, kernel_initializer=ortho_init(np.sqrt(2)),
                                  name='mlp_fc{}'.format(i), activation=activation)(h)
    network = tf.keras.Model(inputs=[x_input], outputs=[h])
    return network

shape = (27,)
network = network_fn(shape)  # FOR Ant-v2 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
return network_fn, network
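For reference, the default network can be rebuilt and smoke-tested in isolation like this (a standalone sketch of mine; num_layers=2, num_hidden=64, and tanh are the usual mlp defaults in baselines, hard-coded here since the originals come from the enclosing scope):

import numpy as np
import tensorflow as tf

def make_default_mlp(input_shape=(27,), num_layers=2, num_hidden=64):
    # Same structure as network_fn above, minus the baselines ortho_init.
    x_input = tf.keras.Input(shape=input_shape)
    h = x_input
    for i in range(num_layers):
        h = tf.keras.layers.Dense(units=num_hidden, activation='tanh',
                                  name='mlp_fc{}'.format(i))(h)
    return tf.keras.Model(inputs=[x_input], outputs=[h])

net = make_default_mlp()
print(net(np.zeros((1, 27), dtype=np.float32)).shape)  # expect (1, 64)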
Here is the updated model I started using:
def network_fn(input_shape):  # Fractal TEST 2.0
    ##################################################################################################
    # input_shape = (27,) - for Ant-v2
    print('input shape is {}'.format(input_shape))
    x_input = tf.keras.Input(shape=input_shape)
    # h = tf.keras.layers.Flatten(x_input)
    h1 = x_input
    h1 = tf.keras.layers.Dense(units=64, name='dense_top_0', activation=activation)(h1)
    h1 = tf.keras.layers.Dense(units=63, name='dense_top_1', activation=activation)(h1)
    h2 = x_input
    h2 = tf.keras.layers.Dense(units=64, name='dense_bot_0', activation=activation)(h2)
    h2 = tf.keras.layers.Dense(units=63, name='dense_bot_1', activation=activation)(h2)
    h3 = tf.keras.layers.concatenate([h1, h2])
    h4 = tf.keras.layers.Dense(units=8, name='dense_out', activation=activation)(h3)
    # network = tf.keras.Model(inputs=[x_input], outputs=[h3])
    network = tf.keras.Model(inputs=[x_input], outputs=[h4])
    return network

shape = (27,)
network = network_fn(shape)  # FOR Ant-v2 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# print(network.summary())
# pol_weights_bias_0 = network.get_layer('dense_0').get_weights()
# print('weights_bias')
# print(pol_weights_bias_0)
# input('')
return network_fn, network
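As a quick standalone check (my own test snippet, not part of baselines), pushing a dummy batch through this model confirms the concatenation is 126 wide (63 + 63) and the output is 8 units, matching Ant-v2's 8-dimensional action space:

import numpy as np

# `network` is the model returned by network_fn((27,)) above.
obs = np.zeros((4, 27), dtype=np.float32)          # dummy Ant-v2-sized batch
print(network(obs).shape)                          # expect (4, 8)
print(network.get_layer('dense_out').input_shape)  # expect (None, 126)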
Here is a function that updates the weights; I created it and use it in policy.py, and it gets called from trpo_mpi.py:
def kill_actions(self):
    # Successfully kills self.mean in DiagGaussianPd(Pd).sample() in distributions.py
    # Fractal test 2.0 (it's really just a duplicated NN)
    # Locking actions for bottom network
    pol_w_bot2, pol_b_bot2 = self.policy_network.get_layer('dense_bot_0').get_weights()
    val_w_bot2, val_b_bot2 = self.value_network.get_layer('dense_bot_0').get_weights()
    pol_w_bot3, pol_b_bot3 = self.policy_network.get_layer('dense_bot_1').get_weights()
    val_w_bot3, val_b_bot3 = self.value_network.get_layer('dense_bot_1').get_weights()
    self.policy_network.get_layer('dense_bot_0').set_weights([pol_w_bot2 * 0, pol_b_bot2 * 0])
    self.value_network.get_layer('dense_bot_0').set_weights([val_w_bot2 * 0, val_b_bot2 * 0])
    self.policy_network.get_layer('dense_bot_1').set_weights([pol_w_bot3 * 0, pol_b_bot3 * 0])
    self.value_network.get_layer('dense_bot_1').set_weights([val_w_bot3 * 0, val_b_bot3 * 0])
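Since the usual mlp activation in baselines is tanh and tanh(0) = 0, zeroing both kernels and biases should silence the whole bottom branch. A quick probe I use to verify that (standalone sketch; `policy_network` stands for the model instance, and I'm assuming the tanh default):

import numpy as np
import tensorflow as tf

# Probe the output of the zeroed branch directly.
probe = tf.keras.Model(inputs=policy_network.inputs,
                       outputs=policy_network.get_layer('dense_bot_1').output)
obs = np.random.randn(8, 27).astype(np.float32)
print(np.abs(probe(obs).numpy()).max())  # expect 0.0 after kill_actions()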
def hold_weights(self):
    # Fractal test 1.0
    # Hold the top weights to what they were saved at after phase 1
    pol_w_b_top0 = np.load('pol_weights_bias_top_0.npy', allow_pickle=True)
    val_w_b_top0 = np.load('val_weights_bias_top_0.npy', allow_pickle=True)
    pol_w_b_top1 = np.load('pol_weights_bias_top_1.npy', allow_pickle=True)
    val_w_b_top1 = np.load('val_weights_bias_top_1.npy', allow_pickle=True)
    self.policy_network.get_layer('dense_top_0').set_weights(pol_w_b_top0)
    self.value_network.get_layer('dense_top_0').set_weights(val_w_b_top0)
    self.policy_network.get_layer('dense_top_1').set_weights(pol_w_b_top1)
    self.value_network.get_layer('dense_top_1').set_weights(val_w_b_top1)
    # Set bottom weights to 0.5 of what they were going to be set at
    # Fractal test 2.0
    pol_w_bot0, pol_b_bot0 = self.policy_network.get_layer('dense_bot_0').get_weights()
    val_w_bot0, val_b_bot0 = self.value_network.get_layer('dense_bot_0').get_weights()
    pol_w_bot1, pol_b_bot1 = self.policy_network.get_layer('dense_bot_1').get_weights()
    val_w_bot1, val_b_bot1 = self.value_network.get_layer('dense_bot_1').get_weights()
    self.policy_network.get_layer('dense_bot_0').set_weights([pol_w_bot0 * 0.5, pol_b_bot0 * 0.5])
    self.value_network.get_layer('dense_bot_0').set_weights([val_w_bot0 * 0.5, val_b_bot0 * 0.5])
    self.policy_network.get_layer('dense_bot_1').set_weights([pol_w_bot1 * 0.5, pol_b_bot1 * 0.5])
    self.value_network.get_layer('dense_bot_1').set_weights([val_w_bot1 * 0.5, val_b_bot1 * 0.5])
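Both functions repeat the same get/scale/set pattern for every layer; as an equivalent refactor (a sketch with identical behavior, not what I actually run), a single helper covers both cases, with scale=0 reproducing kill_actions and scale=0.5 the bottom half of hold_weights:

def scale_layer_weights(model, layer_name, scale):
    # Multiply a layer's kernel and bias by `scale` in place.
    layer = model.get_layer(layer_name)
    layer.set_weights([w * scale for w in layer.get_weights()])

# e.g. the body of kill_actions() reduces to:
for name in ('dense_bot_0', 'dense_bot_1'):
    scale_layer_weights(self.policy_network, name, 0.0)
    scale_layer_weights(self.value_network, name, 0.0)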
Finally, here is the code I have been using after the program exits the "learn" function in trpo_mpi.py, to save the weights:
pol_weights_bias_top_0 = pi_policy_network.get_layer('dense_top_0').get_weights()
val_weights_bias_top_0 = pi_value_network.get_layer('dense_top_0').get_weights()
pol_weights_bias_top_1 = pi_policy_network.get_layer('dense_top_1').get_weights()
val_weights_bias_top_1 = pi_value_network.get_layer('dense_top_1').get_weights()
np.save('pol_weights_bias_top_0.npy', pol_weights_bias_top_0)
np.save('val_weights_bias_top_0.npy', val_weights_bias_top_0)
np.save('pol_weights_bias_top_1.npy', pol_weights_bias_top_1)
np.save('val_weights_bias_top_1.npy', val_weights_bias_top_1)
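Saving a [kernel, bias] list with np.save produces a pickled object array, which is why hold_weights() has to pass allow_pickle=True when loading. A sketch of an np.savez alternative that avoids pickling (the .npz file names are just illustrative variants of mine):

# Save kernel and bias as separate named arrays.
kernel, bias = pi_policy_network.get_layer('dense_top_0').get_weights()
np.savez('pol_weights_bias_top_0.npz', kernel=kernel, bias=bias)

# Loading side (in hold_weights):
data = np.load('pol_weights_bias_top_0.npz')
self.policy_network.get_layer('dense_top_0').set_weights([data['kernel'], data['bias']])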
and here is the phase-2 version:
pol_weights_bias_top_0 = pi_policy_network.get_layer('dense_top_0').get_weights()
val_weights_bias_top_0 = pi_value_network.get_layer('dense_top_0').get_weights()
pol_weights_bias_top_1 = pi_policy_network.get_layer('dense_top_1').get_weights()
val_weights_bias_top_1 = pi_value_network.get_layer('dense_top_1').get_weights()
np.save('pol_weights_bias_top_0_p2.npy', pol_weights_bias_top_0)
np.save('val_weights_bias_top_0_p2.npy', val_weights_bias_top_0)
np.save('pol_weights_bias_top_1_p2.npy', pol_weights_bias_top_1)
np.save('val_weights_bias_top_1_p2.npy', val_weights_bias_top_1)
pol_weights_bias_bot_0 = pi_policy_network.get_layer('dense_bot_0').get_weights()
val_weights_bias_bot_0 = pi_value_network.get_layer('dense_bot_0').get_weights()
pol_weights_bias_bot_1 = pi_policy_network.get_layer('dense_bot_1').get_weights()
val_weights_bias_bot_1 = pi_value_network.get_layer('dense_bot_1').get_weights()
np.save('pol_weights_bias_bot_0_p2.npy', pol_weights_bias_bot_0)
np.save('val_weights_bias_bot_0_p2.npy', val_weights_bias_bot_0)
np.save('pol_weights_bias_bot_1_p2.npy', pol_weights_bias_bot_1)
np.save('val_weights_bias_bot_1_p2.npy', val_weights_bias_bot_1)
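The phase-2 block above can also be collapsed into a loop over the layer names (a behaviorally identical sketch):

for tag in ('top_0', 'top_1', 'bot_0', 'bot_1'):
    layer = 'dense_{}'.format(tag)
    np.save('pol_weights_bias_{}_p2.npy'.format(tag),
            pi_policy_network.get_layer(layer).get_weights())
    np.save('val_weights_bias_{}_p2.npy'.format(tag),
            pi_value_network.get_layer(layer).get_weights())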
Sorry for all the code, but I'm trying to provide as much information as possible.
Is there any reason why a setup that normally trains fine would break until a separate setup is run, and then start working again?
*** UPDATE: When I run the unmodified setup alongside the modified code, the modified code that doesn't work also prevents the unmodified code from running at the same time!
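I have no confirmed explanation, but since the symptom looks like stale global state surviving between runs, one thing I intend to try (purely a guess on my part, not a verified fix) is clearing Keras's global state before building each model:

import tensorflow as tf

# Speculative: reset Keras's global state (layer-name counters, cached
# graph structures) so each new run starts from a clean slate.
tf.keras.backend.clear_session()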
Solution
No working solution has been found for this problem yet.