Unable to find the relevant tensor remote_handle: Op ID: 14738, Output num: 0

Problem Description

I am doing patch-based image classification on a Colab Pro TPU instance, with TensorFlow 2.3.0.

When I call model.fit, I get InvalidArgumentError: Unable to find the relevant tensor remote_handle: Op ID: 14738, Output num: 0, with the following traceback:

--------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-20-5fd2ec1ce2f9> in <module>()
     15         steps_per_epoch=STEPS_PER_EPOCH,
     16         validation_data=dev_ds,
---> 17         validation_steps=VALIDATION_STEPS
     18     )

6 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
    106   def _method_wrapper(self, *args, **kwargs):
    107     if not self._in_multi_worker_mode():  # pylint: disable=protected-access
--> 108       return method(self, *args, **kwargs)
    109 
    110     # Running inside `run_distribute_coordinator` already.

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1084       data_handler._initial_epoch = (  # pylint: disable=protected-access
   1085           self._maybe_load_initial_epoch_from_ckpt(initial_epoch))
-> 1086       for epoch, iterator in data_handler.enumerate_epochs():
   1087         self.reset_metrics()
   1088         callbacks.on_epoch_begin(epoch)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/data_adapter.py in enumerate_epochs(self)
   1140         if self._insufficient_data:  # Set by `catch_stop_iteration`.
   1141           break
-> 1142         if self._adapter.should_recreate_iterator():
   1143           data_iterator = iter(self._dataset)
   1144         yield epoch, data_iterator

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/data_adapter.py in should_recreate_iterator(self)
    725     # each epoch.
    726     return (self._user_steps is None or
--> 727             cardinality.cardinality(self._dataset).numpy() == self._user_steps)
    728 
    729   def _validate_args(self, sample_weights, steps):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
   1061     """
   1062     # TODO(slebedev): Consider avoiding a copy for non-cpu or remote tensors.
-> 1063     maybe_arr = self._numpy()  # pylint: disable=protected-access
   1064     return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
   1065 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
   1029       return self._numpy_internal()
   1030     except core._NotOkStatusException as e:  # pylint: disable=protected-access
-> 1031       six.raise_from(core._status_to_exception(e.code, e.message), None)  # pylint: disable=protected-access
   1032 
   1033   @property

/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)

InvalidArgumentError: Unable to find the relevant tensor remote_handle: Op ID: 14738, Output num: 0

I have two datasets, containing 300,000 and 100,000 images respectively. Keep in mind that my training dataset does not fit in memory.

Here is the code used to create the datasets:

train_dir = '/content/content/Data/train'
dev_dir = '/content/content/Data/dev'

def create_dataset(dir, label_dic, is_training=True):
    filepaths = list(tf.data.Dataset.list_files(dir + '/*.jpg'))

    labels = []

    for f in filepaths:
        # Filenames look like '<id>.jpg'; the id is the key into the label dict.
        ind = f.numpy().decode().split('/')[-1].split('.')[0]
        labels.append(label_dic[ind])

    ds = tf.data.Dataset.from_tensor_slices((filepaths, labels))
    ds = ds.map(load_images, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.cache()

    if is_training:
        ds = ds.shuffle(len(filepaths), reshuffle_each_iteration=True)
        ds = ds.repeat(EPOCHS)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    return ds


train_ds = create_dataset(train_dir, train_label)
dev_ds = create_dataset(dev_dir, dev_label, False)
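
Note that ds.cache() with no argument keeps the decoded images in host RAM; since the training set does not fit in memory, a file-backed cache is one possible variation. This is only a sketch, and the cache path below is hypothetical:

# Sketch: spill the tf.data cache to disk instead of RAM.
# '/content/train.cache' is a hypothetical path; any writable location works.
ds = ds.cache('/content/train.cache')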

Here is the code used to create and compile the model and to fit the datasets. I use a custom Keras model with a VGG16 backbone:

def create_model(input_shape, batch_size):
    VGG16 = keras.applications.VGG16(include_top=False, input_shape=input_shape, weights='imagenet')

    # Freeze the pretrained convolutional base.
    for layer in VGG16.layers:
        layer.trainable = False

    input_layer = keras.Input(shape=input_shape, batch_size=batch_size)

    VGG_out = VGG16(input_layer)

    x = Flatten(name='flatten', input_shape=(512, 8, 8))(VGG_out)
    x = Dense(256, activation='relu', name='fc1')(x)
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid', name='fc2')(x)

    model = Model(input_layer,x)
    model.summary()
    return model

with strategy.scope():

    model = create_model(INPUT_SHAPE, BATCH_SIZE)
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])

model.fit(train_ds,
          epochs=5,
          steps_per_epoch=STEPS_PER_EPOCH,
          validation_data=dev_ds,
          validation_steps=VALIDATION_STEPS)
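
For reference, the snippets rely on several constants that are never shown. The definitions below are assumptions added for completeness, not values from the original notebook; only the 300,000/100,000 dataset sizes come from the text above:

import math

INPUT_SHAPE = (256, 256, 3)   # hypothetical patch size
BATCH_SIZE = 128              # hypothetical global batch size
EPOCHS = 5                    # matches the epochs=5 passed to model.fit

NUM_TRAIN, NUM_DEV = 300_000, 100_000   # dataset sizes quoted above
STEPS_PER_EPOCH = math.ceil(NUM_TRAIN / BATCH_SIZE)
VALIDATION_STEPS = math.ceil(NUM_DEV / BATCH_SIZE)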

For TPU initialization and the distribution strategy I use strategy = tf.distribute.TPUStrategy(resolver). The initialization code is shown below:

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)

tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

strategy = tf.distribute.TPUStrategy(resolver)


A copy of the whole notebook, including outputs, can be found here: Colab Ipython Notebook

Solution

@Pooya448

I know this is quite late, but it may be useful to anyone stuck here. Below is the function I use to connect to a TPU.


import tensorflow as tf


def connect_to_tpu(tpu_address: str = None):
    if tpu_address is not None:  # When using GCP
        cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=tpu_address)
        if tpu_address not in ("", "local"):
            tf.config.experimental_connect_to_cluster(cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
        print("Running on TPU ", cluster_resolver.master())
        print("REPLICAS: ", strategy.num_replicas_in_sync)
        return cluster_resolver, strategy
    else:                        # When using Colab or Kaggle
        try:
            cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
            strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
            print("Running on TPU ", cluster_resolver.master())
            print("REPLICAS: ", strategy.num_replicas_in_sync)
            return cluster_resolver, strategy
        except Exception:        # No TPU available; fall back to GPU/CPU.
            print("WARNING: No TPU detected.")
            mirrored_strategy = tf.distribute.MirroredStrategy()
            return None, mirrored_strategy
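
A minimal usage sketch, assuming the function above is defined and that create_model and the constants from the question are in scope:

# On Colab/Kaggle, call with no address; on GCP, pass the TPU name/address.
cluster_resolver, strategy = connect_to_tpu()

with strategy.scope():
    model = create_model(INPUT_SHAPE, BATCH_SIZE)
    # The model ends in a sigmoid, so from_logits is left at its default (False).
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy'])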