Problem description
I have two networks, where the output of the first network is the input of the second. To compute the loss for the second network I use vanilla policy gradient, and I want to backpropagate this loss into the first network as well. After checking whether the gradients had changed, I found that none of them had. I first load the first network (an autoencoder pre-trained beforehand) like this:
def load_checkpoint(filepath, model):
    # Restore the pre-trained weights and make sure every parameter
    # will receive gradients during the later update.
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = True
    model.train()
    return model
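For context, this is roughly how the loader is called. The AutoEncoder stand-in and the checkpoint path below are placeholders for illustration only, not my actual pre-trained architecture, and the sketch assumes a checkpoint file exists at that path:

import torch
import torch.nn as nn

class AutoEncoder(nn.Module):
    # Placeholder module only, so the loading call below has something to load into;
    # the real network is the pre-trained autoencoder described above.
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(784, 10)
        self.decoder = nn.Linear(10, 784)

    def forward(self, x):
        return self.encoder(x)

model = AutoEncoder()
model = load_checkpoint('autoencoder.pth', model)  # 'autoencoder.pth' is a hypothetical path
# Confirm every parameter is marked to receive gradients
print(all(p.requires_grad for p in model.parameters()))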
Then I define a single optimizer wrapping the optimizers of both networks, like this:
class MultipleOptimizer(object):
    def __init__(self, *op):
        self.optimizers = op

    def zero_grad(self):
        for op in self.optimizers:
            op.zero_grad()

    def step(self):
        for op in self.optimizers:
            op.step()

opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9),
                        Adam(logits_net.parameters(), lr=lr))
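To show the intended usage: the wrapper just forwards zero_grad() and step() to both optimizers, so a single backward() call can update both networks, provided the computation graph actually spans both. A minimal sketch with throwaway modules (sizes and learning rates are arbitrary):

import torch
import torch.nn as nn
from torch.optim import SGD, Adam

net_a = nn.Linear(4, 4)   # stands in for the autoencoder
net_b = nn.Linear(4, 2)   # stands in for logits_net

opt = MultipleOptimizer(SGD(net_a.parameters(), lr=0.1, momentum=0.9),
                        Adam(net_b.parameters(), lr=1e-3))

out = net_b(net_a(torch.randn(8, 4)))
loss = out.pow(2).mean()

opt.zero_grad()
loss.backward()   # gradients reach both nets because the graph covers both
opt.step()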
The reward function is:
# Reward function
def reward(x, act):
    # print('action', act)
    # print('x type', type(x))
    km = KMeans(act, n_init=20, n_jobs=4)
    y_pred = km.fit_predict(x.detach().cpu().numpy())  # seems we can only get a centre from batch
    # print('k-means output type', type(y_pred))
    sil_score = sil(x.detach().cpu().numpy(), y_pred)
    # print('sil score', sil_score)
    return sil_score
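For reference on why detaching inside reward() should not by itself be the problem: in vanilla policy gradient the reward only scales the log-probabilities, so it is expected to be a plain, non-differentiable scalar, and the gradient should flow through logp only. A tiny illustration of that loss shape (the values are made up):

import torch

logp = torch.tensor([-1.2, -0.8, -0.5], requires_grad=True)  # stand-in for policy log-probs
rews = torch.tensor([0.3, 0.7, 0.1])                          # non-differentiable rewards, e.g. silhouette scores

loss = -(logp * rews).mean()
loss.backward()
print(logp.grad)   # gradient w.r.t. logp exists; the rewards never need one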
The architecture of the second network, together with the alternative version I use to avoid (logits = logits.mean(0)):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    # Build a feedforward neural network; the outputs are the logits.
    layers = []
    for j in range(len(sizes) - 1):
        act = activation if j < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j + 1]), act()]
    return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
    def __init__(self):
        super(mlp2, self).__init__()
        self.linear1 = nn.Linear(10, 100)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = torch.nn.Linear(100, 100)
        self.linear3 = torch.nn.Linear(100, 20)
        self.linear4 = torch.nn.Linear(2000, 100)
        self.ident = nn.Identity()

    def forward(self, x):
        a = self.linear1(x)
        a = self.relu1(a)
        a = self.linear2(a)
        a = self.relu1(a)
        a = self.linear3(a)
        a = torch.flatten(a)
        a = self.linear4(a)
        a = self.relu1(a)
        a = self.linear3(a)
        out = self.ident(a)
        return out
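logits_net is the policy network built from the code above, taking the 10-dimensional encoder output as input. For concreteness, it is created along these lines (the hidden size and number of actions here are illustrative, not my actual values):

n_acts = 20                               # example value only
logits_net = mlp(sizes=[10, 64, n_acts]).cuda()

# Or the hand-written variant:
# logits_net = mlp2().cuda()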
The loss is computed in the following order:
def get_policy(obs):
    logits = logits_net(obs)
    return Categorical(logits=logits.mean(0))

def get_action(obs):
    return get_policy(obs).sample().item()

def Logp(obs, act):
    logp = get_policy(obs).log_prob(act.cuda())
    return logp

def compute_loss(logp, weights):
    return -(logp * weights).mean()

def train_one_epoch():
    # make some empty lists for logging.
    batch_obs = []       # for observations
    batch_acts = []      # for actions
    batch_weights = []   # for R(tau) weighting in policy gradient
    batch_logp = []

    # reset episode-specific variables
    j = 1                # signal from environment that episode is over
    ep_rews = []         # list for rewards accrued throughout ep

    for i, data in enumerate(train_loader):
        # Create the mean image out of those 100 images
        x, label = data
        x = model(x.cuda())           # torch.Size([100, 10])
        obs = x.data.cpu().numpy()    # [100, 10] - a trajectory with only one state
        # Save obs
        batch_obs.append(obs.copy())

        # act in the environment
        # act = get_action(torch.as_tensor(obs, dtype=torch.float32))
        act = get_action(x)
        print('action type', type(act))

        # log probability
        # logp = Logp(torch.as_tensor(obs, dtype=torch.float32), act=torch.as_tensor(act, dtype=torch.int32))
        logp = Logp(x, act=torch.as_tensor(act, dtype=torch.int32))

        # rew = reward(obs, act + 2)
        rew = reward(x, act + 2)

        # save action, reward
        batch_acts.append(act)
        batch_weights.append(rew)     # episode rewards
        batch_logp.append(logp)

    opt.zero_grad()
    batch_logp = torch.stack(batch_logp, dim=0)
    batch_loss = compute_loss(logp=torch.as_tensor(batch_logp),
                              weights=torch.as_tensor(batch_weights, dtype=torch.float32))
    batch_loss.backward()             # does it return anything? gradients? print them!
    opt.step()

    for name, param in logits_net.named_parameters():
        print(name, param.grad)
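To narrow down where the graph might be getting cut, I also check grad_fn and requires_grad on the intermediate tensors before calling backward(). A small self-contained sketch of the kind of check I mean (the report helper is just for illustration):

import torch

def report(name, t):
    # A tensor still attached to the autograd graph has a grad_fn;
    # a tensor rebuilt from plain Python numbers does not.
    print(name, '| requires_grad =', t.requires_grad, '| grad_fn =', t.grad_fn)

a = torch.randn(3, requires_grad=True)
b = a * 2
report('b', b)                               # MulBackward0 -> still in the graph

c = torch.as_tensor([v.item() for v in b])   # rebuilt from detached scalars
report('c', c)                               # requires_grad=False, grad_fn=None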
I made some changes, on the assumption that the problem might be that some tensors were being re-created. The output of the first network, obs, was converted with obs = x.data.cpu().numpy() and then passed to the get_action function as act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changed this to act = get_action(x), so x is passed to the function directly. I also changed the arguments of Logp to logp = Logp(x, act=torch.as_tensor(act, dtype=torch.int32)).
After these changes I still get None for the gradients. When the loss is computed this way, is it still possible to backpropagate the gradients? Is there any change I could apply?
Thanks for your help.