问题描述
我写了一个简单的pytorch脚本来训练MNIST,它运行良好。我重新实现了与Trainable类一起使用的脚本:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets,transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import ray
from ray import tune
# Change these values if you want the training to run quicker or slower.
EPOCH_SIZE = 512
TEST_SIZE = 256
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet,self).__init__()
# In this example,we don't change the model architecture
# due to simplicity.
self.conv1 = nn.Conv2d(1,3,kernel_size=3)
self.fc = nn.Linear(192,10)
def forward(self,x):
x = F.relu(F.max_pool2d(self.conv1(x),3))
x = x.view(-1,192)
x = self.fc(x)
return F.log_softmax(x,dim=1)
class AlexTrainer(tune.Trainable):
def setup(self,config):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Data Setup
mnist_transforms = transforms.Compose([transforms.ToTensor(),transforms.normalize((0.1307,),(0.3081,))])
self.train_loader = DataLoader(
datasets.MNIST("~/data",train=True,download=True,transform=mnist_transforms),batch_size=64,shuffle=True)
self.test_loader = DataLoader(
datasets.MNIST("~/data",train=False,shuffle=True)
self.model = ConvNet()
self.optimizer = optim.SGD(self.model.parameters(),lr=config["lr"],momentum=config["momentum"])
print('finished setup')
def step(self):
self.train()
print("after train")
acc = self.test()
return {'acc': acc}
def train(self):
print("in train")
self.model.train()
for batch_idx,(data,target) in enumerate(self.train_loader):
# We set this just for the example to run quickly.
if batch_idx * len(data) > EPOCH_SIZE:
return
data,target = data.to(self.device),target.to(self.device)
self.optimizer.zero_grad()
print(type(data))
output = self.model(data)
loss = F.nll_loss(output,target)
loss.backward()
self.optimizer.step()
def test(self):
self.model.eval()
correct = 0
total = 0
with torch.no_grad():
for batch_idx,target) in enumerate(self.test_loader):
# We set this just for the example to run quickly.
if batch_idx * len(data) > TEST_SIZE:
break
data,target.to(self.device)
outputs = self.model(data)
_,predicted = torch.max(outputs.data,1)
total += target.size(0)
correct += (predicted == target).sum().item()
return correct / total
if __name__ == '__main__':
ray.init()
analysis = tune.run(
AlexTrainer,stop={"training_iteration": 2},# verbose=1,config={
"lr": tune.sample_from(lambda spec: 10 ** (-10 * np.random.rand())),"momentum": tune.uniform(0.1,0.9)
}
)
但是,当我尝试运行时,这次却失败了:
Traceback (most recent call last):
File "/hdd/raytune/venv/lib/python3.6/site-packages/ray/tune/trial_runner.py",line 473,in _process_trial
is_duplicate = RESULT_DUPLICATE in result
TypeError: argument of type 'nonetype' is not iterable
Traceback (most recent call last):
File "/hdd/raytune/test_3.py",line 116,in <module>
"momentum": tune.uniform(0.1,0.9)
File "/hdd/raytune/venv/lib/python3.6/site-packages/ray/tune/tune.py",line 356,in run
raise TuneError("Trials did not complete",incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete',[AlexTrainer_9b3cd_00000])
这可能是什么原因?
解决方法
这是因为您实际上是在覆盖train
中现有的Trainable
方法。如果您将train
方法重命名为其他方法,则应该可以正常使用。