ValueError: negative step not yet supported(尚不支持负步长)如何解决

问题描述

我使用 4 个 GPU,并使用 PyTorch DDP 来加速训练。下面是我的数据集代码:

import ast
import random
from glob import glob

import cv2
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler

# Restrict OpenCV to a single internal thread.
cv2.setNumThreads(1)
# Select the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VimeoDataset(Dataset):
    """Frame-triplet dataset that reads one ``.npz`` file per sample.

    Each file is expected to hold an ``'i0i1gt'`` array of shape (H, W, 9)
    (channels 0:3 -> ``img0``, 3:6 -> ``img1``, 6:9 -> ``gt``) and, for the
    training split, an ``'ft0ft1'`` array of shape (H, W, 4) used as the
    flow target.  Only one file is loaded per ``__getitem__`` call, so memory
    use stays bounded by a single sample.

    All augmentation is done on NumPy arrays and the conversion to torch
    tensors happens last: torch tensors do not accept negative-step slices
    such as ``x[::-1]`` (older versions raise ``ValueError: negative step not
    yet supported``), whereas NumPy arrays do.
    """

    def __init__(self, mode):  # mode == 'train' -> train split, anything else -> val split
        self.h = 256
        self.w = 448
        self.mode = mode
        # glob() returns files in filesystem order; sort so that every DDP
        # process maps the same index to the same file.
        self.trainlist = sorted(glob('/data/vimeoFlow2/dataset/train/*.npz'))
        self.vallist = sorted(glob('/data/vimeoFlow2/dataset/val/*.npz'))
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)  # shape (256, 448)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)  # shape (256, 448)
        self.grid = np.stack((xx, yy), 2).copy()  # shape (256, 448, 2)
        if self.mode == 'train':
            self.npzs = self.trainlist[:80]
        else:
            self.npzs = self.vallist[:20]

    def __len__(self):
        """Return the number of sample files in the selected split."""
        return len(self.npzs)

    def aug(self, img0, gt, img1, flow_gt, h, w):
        """Take the same random ``h`` x ``w`` crop from all four (H, W, C) arrays.

        Returns:
            ``(img0, gt, img1, flow_gt)`` cropped to ``(h, w, C)``.
        """
        ih, iw, _ = img0.shape
        x = np.random.randint(0, ih - h + 1)
        y = np.random.randint(0, iw - w + 1)
        # One shared window keeps the frames and the flow target aligned.
        img0 = img0[x:x + h, y:y + w, :]
        img1 = img1[x:x + h, y:y + w, :]
        gt = gt[x:x + h, y:y + w, :]
        flow_gt = flow_gt[x:x + h, y:y + w, :]
        return img0, gt, img1, flow_gt

    def getimg(self, index):
        """Load sample ``index`` from disk.

        Returns:
            ``(img0, gt, img1, flow_gt)`` as (H, W, C) NumPy arrays.  Outside
            training mode the flow target is an all-zero (256, 448, 4) array.
        """
        # Context manager closes the underlying zip file; indexing the
        # NpzFile materialises each array, so they stay valid afterwards.
        with np.load(self.npzs[index]) as f:
            data = f['i0i1gt']
            if self.mode == 'train':
                flow_gt = f['ft0ft1']
            else:
                flow_gt = np.zeros((256, 448, 4))
        img0 = data[:, :, 0:3]
        img1 = data[:, :, 3:6]
        gt = data[:, :, 6:9]
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        """Return ``(frames, flow_gt)`` tensors for sample ``index``.

        ``frames`` is ``img0``, ``img1`` and ``gt`` concatenated along the
        channel axis, shape (9, H, W); ``flow_gt`` has shape (4, H, W).  In
        training mode the sample is cropped to 224 x 224 and randomly
        augmented first.
        """
        img0, gt, img1, flow_gt = self.getimg(index)
        if self.mode == 'train':
            img0, gt, img1, flow_gt = self.aug(img0, gt, img1, flow_gt, 224, 224)
            # NOTE: the arrays are still NumPy here on purpose -- the
            # negative-step slices below are not supported on torch tensors.
            if random.uniform(0, 1) < 0.5:
                # Reverse the channel order of the frames; flow is unaffected.
                img0 = img0[:, :, ::-1]
                img1 = img1[:, :, ::-1]
                gt = gt[:, :, ::-1]
            if random.uniform(0, 1) < 0.5:
                # Flip along axis 0 (rows) and negate flow channels 1 and 3
                # (presumably the vertical flow components -- confirm against
                # how 'ft0ft1' was generated).
                img0 = img0[::-1]
                img1 = img1[::-1]
                gt = gt[::-1]
                flow_gt = flow_gt[::-1]
                flow_gt = np.concatenate(
                    (flow_gt[:, :, 0:1], -flow_gt[:, :, 1:2],
                     flow_gt[:, :, 2:3], -flow_gt[:, :, 3:4]), 2)
            if random.uniform(0, 1) < 0.5:
                # Flip along axis 1 (columns) and negate flow channels 0 and 2
                # (presumably the horizontal flow components).
                img0 = img0[:, ::-1]
                img1 = img1[:, ::-1]
                gt = gt[:, ::-1]
                flow_gt = flow_gt[:, ::-1]
                flow_gt = np.concatenate(
                    (-flow_gt[:, :, 0:1], flow_gt[:, :, 1:2],
                     -flow_gt[:, :, 2:3], flow_gt[:, :, 3:4]), 2)
            if random.uniform(0, 1) < 0.5:
                # Swap the two input frames and the two halves of the flow.
                img0, img1 = img1, img0
                flow_gt = np.concatenate(
                    (flow_gt[:, :, 2:4], flow_gt[:, :, 0:2]), 2)
        # .copy() makes the (possibly negatively strided) views contiguous,
        # which torch.from_numpy requires; permute turns HWC into CHW.
        flow_gt = torch.from_numpy(flow_gt.copy()).permute(2, 0, 1)
        img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
        img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
        gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)
        return torch.cat((img0, img1, gt), 0), flow_gt

我使用以下数据加载器。

# Training split: the class is spelled ``DistributedSampler`` (capital D);
# it hands each DDP process its own subset of the dataset indices.
# NOTE: per the PyTorch docs, call ``sampler.set_epoch(epoch)`` at the start
# of every epoch in the training loop so the shuffling differs between epochs.
dataset = VimeoDataset(mode='train')
sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=0,
                        pin_memory=True, drop_last=True, sampler=sampler)
# Validation split: no sampler is passed, so the loader iterates the whole split.
dataset_val = VimeoDataset(mode='val')
val_data = DataLoader(dataset_val, batch_size=16, num_workers=0)

我使用以下启动命令。

python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 --node_rank=0 train.py --epoch=100 --batch_size=16

错误回溯如下。我的数据集分为 train 和 val 两部分,均以 npz 文件保存。由于内存大小的限制,我一次只能根据索引读取一个 npz 文件。

 training...
    Traceback (most recent call last):
      File "train.py",line 173,in <module>
        train(model,args.local_rank)
      File "train.py",line 72,in train
        for i,data in enumerate(train_data):
      File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py",line 346,in __next__
        data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
      File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py",line 44,in fetch
        data = [self.dataset[idx] for idx in possibly_batched_index]
      File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py",line 44,in <listcomp>
        data = [self.dataset[idx] for idx in possibly_batched_index]
      File "/code/RIFE/dataset.py",line 67,in __getitem__
        img0 = img0[::-1]
    ValueError: negative step not yet supported
    File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launch.py",line 249,in main
        cmd=cmd)
    subprocess.CalledProcessError: Command '['/opt/conda/bin/python','-u','train.py','--local_rank=3','--epoch=100','--batch_size=1']' returned non-zero exit status 1.

解决方法

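暂未找到经过验证的完整解决方案,小编正在努力寻找和整理中!从上面的回溯可以看出,错误发生在对 torch 张量执行 `img0[::-1]` 时:PyTorch 张量的切片不支持负步长。可以先在 NumPy 数组上完成翻转,再用 `torch.from_numpy(arr.copy())` 转换为张量,或者直接对张量使用 `torch.flip`。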

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)