How to clear Colab TensorFlow TPU memory

Problem description

I am training a model over multiple cross-validation folds. After each fold I want to clear the TPU memory so that I don't run into an OOM error.
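
For reference, the per-fold loop looks roughly like the sketch below. This is a reconstruction from the traceback, not the actual notebook: config, count_data_items, and files_train are assumed helpers, and the layers after the backbone are placeholders.

import tensorflow as tf
import efficientnet.tfkeras as efn

def build_model(size, count=820):
    # EfficientNetB7 backbone as in the traceback; the head layers are
    # placeholder assumptions standing in for the original model definition.
    base_model = efn.EfficientNetB7(input_shape=(size, size, 3),
                                    weights='imagenet', include_top=False)
    return tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

for fold in range(5):
    # clear_session() resets Keras' global graph state, but it does not
    # free memory already allocated on the TPU device, so later folds can
    # still hit the OOM shown below.
    tf.keras.backend.clear_session()
    with config.strategy.scope():  # `config` is an assumed config object
        model = build_model(config.img_size,
                            count=count_data_items(files_train) / config.batch_size)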

Full traceback of the current error:

ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-16-b7e0725f5c4d> in <module>()
      1 tf.keras.backend.clear_session()
      2 with config.strategy.scope():
----> 3   model = build_model(config.img_size,count = count_data_items(files_train)/config.batch_size)

25 frames
<ipython-input-9-5b219db28f69> in build_model(size,count)
      1 def build_model(size,count=820):
      2 
----> 3     base_model = efn.EfficientNetB7(input_shape=(size,size,3),weights='imagenet',include_top=False)
      4 
      5     model = tf.keras.Sequential([

/usr/local/lib/python3.7/dist-packages/efficientnet/__init__.py in wrapper(*args,**kwargs)
     55         kwargs['models'] = tfkeras.models
     56         kwargs['utils'] = tfkeras.utils
---> 57         return func(*args,**kwargs)
     58 
     59     return wrapper

/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNetB7(include_top,weights,input_tensor,input_shape,pooling,classes,**kwargs)
    604         input_tensor=input_tensor,input_shape=input_shape,
    605         pooling=pooling,classes=classes,
--> 606         **kwargs
    607     )
    608 

/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNet(width_coefficient,depth_coefficient,default_resolution,dropout_rate,drop_connect_rate,depth_divisor,blocks_args,model_name,include_top,**kwargs)
    348                       use_bias=False,
    349                       kernel_initializer=CONV_KERNEL_INITIALIZER,
--> 350                       name='stem_conv')(x)
    351     x = layers.BatchNormalization(axis=bn_axis,name='stem_bn')(x)
    352     x = layers.Activation(activation,name='stem_activation')(x)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self,*args,**kwargs)
    968     if _in_functional_construction_mode(self,inputs,args,kwargs,input_list):
    969       return self._functional_construction_call(inputs,
--> 970                                                 input_list)
    971 
    972     # Maintains info about the `Layer.call` stack.

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _functional_construction_call(self,input_list)
   1106       # Check input assumptions set after layer building,e.g. input shape.
   1107       outputs = self._keras_tensor_symbolic_call(
-> 1108           inputs,input_masks,kwargs)
   1109 
   1110       if outputs is None:

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _keras_tensor_symbolic_call(self,kwargs)
    838       return nest.map_structure(keras_tensor.KerasTensor,output_signature)
    839     else:
--> 840       return self._infer_output_signature(inputs,input_masks)
    841 
    842   def _infer_output_signature(self,input_masks):

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _infer_output_signature(self,input_masks)
    876           # overridden).
    877           # TODO(kaftan): do we maybe_build here,or have we already done it?
--> 878           self._maybe_build(inputs)
    879           inputs = self._maybe_cast_inputs(inputs)
    880           outputs = call_fn(inputs,**kwargs)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _maybe_build(self,inputs)
   2623         # operations.
   2624         with tf_utils.maybe_init_scope(self):
-> 2625           self.build(input_shapes)  # pylint:disable=not-callable
   2626       # We must set also ensure that the layer is marked as built,and the build
   2627       # shape is stored since user defined build functions may not be calling

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/convolutional.py in build(self,input_shape)
    202         constraint=self.kernel_constraint,
    203         trainable=True,
--> 204         dtype=self.dtype)
    205     if self.use_bias:
    206       self.bias = self.add_weight(

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in add_weight(self,name,shape,dtype,initializer,regularizer,trainable,constraint,use_resource,synchronization,aggregation,**kwargs)
    653         synchronization=synchronization,
    654         aggregation=aggregation,
--> 655         caching_device=caching_device)
    656     if regularizer is not None:
    657       # TODO(fchollet): in the future,this should be handled at the

/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/tracking/base.py in _add_variable_with_custom_getter(self,getter,overwrite,**kwargs_for_getter)
    813         dtype=dtype,
    814         initializer=initializer,
--> 815         **kwargs_for_getter)
    816 
    817     # If we set an initializer and the variable processed it,tracking will not

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer_utils.py in make_variable(name,caching_device,validate_shape,collections,partitioner)
    137       synchronization=synchronization,
    138       aggregation=aggregation,
--> 139       shape=variable_shape if variable_shape else None)
    140 
    141 

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls,*args,**kwargs)
    258   def __call__(cls,*args,**kwargs):
    259     if cls is VariableV1:
--> 260       return cls._variable_v1_call(*args,**kwargs)
    261     elif cls is Variable:
    262       return cls._variable_v2_call(*args,**kwargs)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in _variable_v1_call(cls,initial_value,variable_def,expected_shape,import_scope,shape)
    219         synchronization=synchronization,
    220         aggregation=aggregation,
--> 221         shape=shape)
    222 
    223   def _variable_v2_call(cls,

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in getter(**kwargs)
     65 
     66   def getter(**kwargs):
---> 67     return captured_getter(captured_previous,**kwargs)
     68 
     69   return getter

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in creator_with_resource_vars(next_creator,**kwargs)
   2109         checkpoint_restore_uid = None
   2110 
-> 2111       created = self._create_variable(next_creator,**kwargs)
   2112 
   2113       if checkpoint_restore_uid is not None:

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _create_variable(self,next_creator,**kwargs)
   1167         self._container_strategy(),_real_mirrored_creator,
   1168         distribute_utils.TPU_VARIABLE_CLASS_MAPPING,
-> 1169         distribute_utils.TPU_VARIABLE_POLICY_MAPPING,**kwargs)
   1170 
   1171   def _gather_to_implementation(self,value,destinations,axis,options):

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_utils.py in create_mirrored_variable(strategy,real_mirrored_creator,class_mapping,policy_mapping,**kwargs)
    304   # here.
    305   with tape.stop_recording():
--> 306     value_list = real_mirrored_creator(**kwargs)
    307     # MirroredVariable is recreated during saved_model loading,and its
    308     # component variables (value_list) will have None initializer. We

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _real_mirrored_creator(**kwargs)
   1158 
   1159           with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
-> 1160             v = next_creator(**kwargs)
   1161 
   1162           assert not isinstance(v,tpu_values.TPUMirroredVariable)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in <lambda>(**kwargs)
    197                         shape=None):
    198     """Call on Variable class. Useful to force the signature."""
--> 199     previous_getter = lambda **kwargs: default_variable_creator(None,**kwargs)
    200     for _,getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
    201       previous_getter = _make_getter(getter,previous_getter)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variable_scope.py in default_variable_creator(next_creator,**kwargs)
   2624         synchronization=synchronization,
   2625         aggregation=aggregation,
-> 2626         shape=shape)
   2627   else:
   2628     return variables.RefVariable(

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls,*args,**kwargs)
    262       return cls._variable_v2_call(*args,**kwargs)
    263     else:
--> 264       return super(VariableMetaclass,cls).__call__(*args,**kwargs)
    265 
    266 

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in __init__(self,distribute_strategy,shape)
   1593           aggregation=aggregation,
   1594           shape=shape,
-> 1595           distribute_strategy=distribute_strategy)
   1596 
   1597   def _init_from_args(self,

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in _init_from_args(self,shape)
   1729                                                   dtype=dtype)
   1730           if shape is not None:
-> 1731             if not initial_value.shape.is_compatible_with(shape):
   1732               raise ValueError(
   1733                   "The initial value's shape (%s) is not compatible with "

/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in shape(self)
   1196         # `_tensor_shape` is declared and defined in the definition of
   1197         # `EagerTensor`,in C.
-> 1198         self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
   1199       except core._NotOkStatusException as e:
   1200         six.raise_from(core._status_to_exception(e.code,e.message),None)

ResourceExhaustedError: Failed to allocate request for 18.0KiB (18432B) on device ordinal 0

Workaround

Personally, I wouldn't try to clear TPU memory. If you hit an OOM on a Google Colab TPU, use a smaller batch size, use a smaller model, or switch to a Kaggle TPU, which has twice the memory of Colab's TPU.
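
For the first two options, here is a minimal sketch of the standard Colab TPU setup with a reduced batch size. None of this code is from the original post, and the batch sizes are arbitrary examples.

import tensorflow as tf

# Connect to the Colab TPU and build a distribution strategy.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# Option 1: shrink the batch size. The global batch is split across the
# TPU's replicas (8 cores on Colab), so halving the per-replica batch
# roughly halves the activation memory needed per core.
per_replica_batch = 8  # e.g. halved from 16 after an OOM
global_batch = per_replica_batch * strategy.num_replicas_in_sync

# Option 2: swap in a smaller backbone, e.g. EfficientNetB0 instead of B7:
# base_model = efn.EfficientNetB0(input_shape=(size, size, 3),
#                                 weights='imagenet', include_top=False)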