问题描述
我正在训练多折(K-fold)交叉验证模型。每一折训练结束后,我想清除 TPU 内存,以免出现 OOM(内存不足)错误。
当前错误的完整回溯(traceback)如下:
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-16-b7e0725f5c4d> in <module>()
1 tf.keras.backend.clear_session()
2 with config.strategy.scope():
----> 3 model = build_model(config.img_size,count = count_data_items(files_train)/config.batch_size)
25 frames
<ipython-input-9-5b219db28f69> in build_model(size,count)
1 def build_model(size,count=820):
2
----> 3 base_model = efn.EfficientNetB7(input_shape=(size,size,3),weights='imagenet',include_top=False)
4
5 model = tf.keras.Sequential([
/usr/local/lib/python3.7/dist-packages/efficientnet/__init__.py in wrapper(*args,**kwargs)
55 kwargs['models'] = tfkeras.models
56 kwargs['utils'] = tfkeras.utils
---> 57 return func(*args,**kwargs)
58
59 return wrapper
/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNetB7(include_top,weights,input_tensor,input_shape,pooling,classes,**kwargs)
604 input_tensor=input_tensor,input_shape=input_shape,
605 pooling=pooling,classes=classes,
--> 606 **kwargs
607 )
608
/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNet(width_coefficient,depth_coefficient,default_resolution,dropout_rate,drop_connect_rate,depth_divisor,blocks_args,model_name,include_top,**kwargs)
348 use_bias=False,
349 kernel_initializer=CONV_KERNEL_INITIALIZER,
--> 350 name='stem_conv')(x)
351 x = layers.BatchNormalization(axis=bn_axis,name='stem_bn')(x)
352 x = layers.Activation(activation,name='stem_activation')(x)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self,*args,**kwargs)
968 if _in_functional_construction_mode(self,inputs,args,kwargs,input_list):
969 return self._functional_construction_call(inputs,
--> 970 input_list)
971
972 # Maintains info about the `Layer.call` stack.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _functional_construction_call(self,input_list)
1106 # Check input assumptions set after layer building,e.g. input shape.
1107 outputs = self._keras_tensor_symbolic_call(
-> 1108 inputs,input_masks,kwargs)
1109
1110 if outputs is None:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _keras_tensor_symbolic_call(self,kwargs)
838 return nest.map_structure(keras_tensor.KerasTensor,output_signature)
839 else:
--> 840 return self._infer_output_signature(inputs,input_masks)
841
842 def _infer_output_signature(self,input_masks):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _infer_output_signature(self,input_masks)
876 # overridden).
877 # TODO(kaftan): do we maybe_build here,or have we already done it?
--> 878 self._maybe_build(inputs)
879 inputs = self._maybe_cast_inputs(inputs)
880 outputs = call_fn(inputs,**kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _maybe_build(self,inputs)
2623 # operations.
2624 with tf_utils.maybe_init_scope(self):
-> 2625 self.build(input_shapes) # pylint:disable=not-callable
2626 # We must set also ensure that the layer is marked as built,and the build
2627 # shape is stored since user defined build functions may not be calling
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/convolutional.py in build(self,input_shape)
202 constraint=self.kernel_constraint,
203 trainable=True,
--> 204 dtype=self.dtype)
205 if self.use_bias:
206 self.bias = self.add_weight(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in add_weight(self,name,shape,dtype,initializer,regularizer,trainable,constraint,use_resource,synchronization,aggregation,**kwargs)
653 synchronization=synchronization,
654 aggregation=aggregation,
--> 655 caching_device=caching_device)
656 if regularizer is not None:
657 # TODO(fchollet): in the future,this should be handled at the
/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/tracking/base.py in _add_variable_with_custom_getter(self,getter,overwrite,**kwargs_for_getter)
813 dtype=dtype,
814 initializer=initializer,
--> 815 **kwargs_for_getter)
816
817 # If we set an initializer and the variable processed it,tracking will not
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer_utils.py in make_variable(name,caching_device,validate_shape,collections,partitioner)
137 synchronization=synchronization,
138 aggregation=aggregation,
--> 139 shape=variable_shape if variable_shape else None)
140
141
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls,**kwargs)
258 def __call__(cls,**kwargs):
259 if cls is VariableV1:
--> 260 return cls._variable_v1_call(*args,**kwargs)
261 elif cls is Variable:
262 return cls._variable_v2_call(*args,**kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in _variable_v1_call(cls,initial_value,variable_def,expected_shape,import_scope,shape)
219 synchronization=synchronization,
220 aggregation=aggregation,
--> 221 shape=shape)
222
223 def _variable_v2_call(cls,
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in getter(**kwargs)
65
66 def getter(**kwargs):
---> 67 return captured_getter(captured_previous,**kwargs)
68
69 return getter
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in creator_with_resource_vars(next_creator,**kwargs)
2109 checkpoint_restore_uid = None
2110
-> 2111 created = self._create_variable(next_creator,**kwargs)
2112
2113 if checkpoint_restore_uid is not None:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _create_variable(self,next_creator,**kwargs)
1167 self._container_strategy(),_real_mirrored_creator,
1168 distribute_utils.TPU_VARIABLE_CLASS_MAPPING,
-> 1169 distribute_utils.TPU_VARIABLE_POLICY_MAPPING,**kwargs)
1170
1171 def _gather_to_implementation(self,value,destinations,axis,options):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_utils.py in create_mirrored_variable(strategy,real_mirrored_creator,class_mapping,policy_mapping,**kwargs)
304 # here.
305 with tape.stop_recording():
--> 306 value_list = real_mirrored_creator(**kwargs)
307 # MirroredVariable is recreated during saved_model loading,and its
308 # component variables (value_list) will have None initializer. We
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _real_mirrored_creator(**kwargs)
1158
1159 with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
-> 1160 v = next_creator(**kwargs)
1161
1162 assert not isinstance(v,tpu_values.TPUMirroredVariable)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in <lambda>(**kwargs)
197 shape=None):
198 """Call on Variable class. Useful to force the signature."""
--> 199 previous_getter = lambda **kwargs: default_variable_creator(None,**kwargs)
200 for _,getter in ops.get_default_graph()._variable_creator_stack: # pylint: disable=protected-access
201 previous_getter = _make_getter(getter,previous_getter)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variable_scope.py in default_variable_creator(next_creator,**kwargs)
2624 synchronization=synchronization,
2625 aggregation=aggregation,
-> 2626 shape=shape)
2627 else:
2628 return variables.RefVariable(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls,**kwargs)
262 return cls._variable_v2_call(*args,**kwargs)
263 else:
--> 264 return super(VariableMetaclass,cls).__call__(*args,**kwargs)
265
266
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in __init__(self,distribute_strategy,shape)
1593 aggregation=aggregation,
1594 shape=shape,
-> 1595 distribute_strategy=distribute_strategy)
1596
1597 def _init_from_args(self,
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in _init_from_args(self,shape)
1729 dtype=dtype)
1730 if shape is not None:
-> 1731 if not initial_value.shape.is_compatible_with(shape):
1732 raise ValueError(
1733 "The initial value's shape (%s) is not compatible with "
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in shape(self)
1196 # `_tensor_shape` is declared and defined in the definition of
1197 # `EagerTensor`,in C.
-> 1198 self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
1199 except core._NotOkStatusException as e:
1200 six.raise_from(core._status_to_exception(e.code,e.message),None)
ResourceExhaustedError: Failed to allocate request for 18.0KiB (18432B) on device ordinal 0
只是为了消除“看起来你的帖子主要是代码;请添加更多细节。”的提示。只是为了消除“看起来您的帖子主要是代码;请添加更多细节。”的提示。只是为了消除“看起来您的帖子主要是代码;请添加更多详细信息。”的提示。
解决方法
我个人不会尝试清除 TPU 内存。如果 Google Colab TPU 出现 OOM,请使用较小的批量大小、较小的模型,或者使用内存是 Colab TPU 两倍的 Kaggle TPU。