问题描述
我正在 Colab TPU 上训练我的模型。但是我遇到了 model.fit()
API 的一些令人困惑的行为。我只有在传递 validation_data
参数时才会遇到这个问题。
场景 1
model.fit(get_train_ds().repeat(),steps_per_epoch = 100,epochs = 10,callbacks = [lr_callback],validation_data = get_val_ds(),validation_steps = 100)
返回错误:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-20-f929303620ea> in <module>()
20 callbacks = [lr_callback],21 validation_data = get_val_ds(),---> 22 validation_steps = 100)
23
24 with open("trainHistoryDict",'wb') as file_pi:
22 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/models.py in _clone_functional_model(model,input_tensors,layer_fn)
178 input_tensors = nest.flatten(input_tensors)
179 for i,input_tensor in enumerate(input_tensors):
--> 180 original_input_layer = model._input_layers[i]
181
182 # Cache input layer. Create a new layer if the tensor is originally not
IndexError: list index out of range
场景 2
当使用 checkpoint
回调并将 monitor
设置为 val_loss
时,例如:
checkpoint = keras.callbacks.ModelCheckpoint('model_{epoch:03d}-{val_loss:.8f}.h5',save_weights_only=True,monitor='val_loss',save_freq="epoch")
返回错误
KeyError: 'Failed to format this callback filepath: "model_{epoch:03d}-{val_loss:.8f}.h5". Reason: \'val_loss\'
此外,在 GPU 上训练时,它总是显示每个 epoch 的验证损失。但是,在 TPU 上训练时,从未显示验证损失。
根据 3 个观察结果,我怀疑没有在 Colab TPU 上评估验证数据?那是对的吗?有没有办法在每个 epoch 结束时计算验证数据?
==== 更新:
我的数据管道代码如下:
# Dataset configuration: batch size and the GCS locations of the TFRecord shards.
batch_size = 16
gcs_path = "gs://my-bucket/"

# Resolve the shard filenames for each split up front.
train_pattern = f"{gcs_path}train/*.tfrecords"
train_fns = tf.io.gfile.glob(train_pattern)

val_pattern = f"{gcs_path}val/*.tfrecords"
val_fns = tf.io.gfile.glob(val_pattern)
def get_train_ds():
    """Build the training pipeline: shuffled, parsed, batched TFRecord examples.

    Returns a finite `tf.data.Dataset`; callers apply `.repeat()` themselves
    (as the quoted `model.fit` calls do).
    """
    ds = tf.data.TFRecordDataset(train_fns, num_parallel_reads=AUTO)
    # NOTE(review): buffer_size equals the number of *files*, not records —
    # this shuffles far fewer elements than intended if each shard holds many
    # examples. Confirm whether a record-level buffer was meant.
    ds = ds.shuffle(buffer_size=len(train_fns), reshuffle_each_iteration=True)
    ds = ds.map(parse_func)
    # drop_remainder=True keeps batch shapes static (presumably for the TPU —
    # confirm). Fix: prefetch with AUTO for consistency with get_val_ds();
    # the original passed `batch_size` (16) as the prefetch buffer, which
    # looks accidental.
    ds = ds.batch(batch_size, drop_remainder=True).prefetch(AUTO)
    return ds
def get_val_ds():
    """Build the validation pipeline: parsed, batched TFRecord examples.

    No shuffling — validation order is irrelevant and determinism is useful.
    """
    records = tf.data.TFRecordDataset(val_fns, num_parallel_reads=AUTO)
    return (
        records
        .map(parse_func)
        .batch(batch_size, drop_remainder=True)
        .prefetch(AUTO)
    )
=== 更新
# Create the model inside the TPUStrategy scope so its variables are placed
# on the TPU (creating the model in the TPUStrategy scope means we will train
# the model on the TPU).
with tpu_strategy.scope():
    model = create_model()

# NOTE(review): `periodic_save` is constructed but never passed to
# `model.fit(callbacks=...)` below, so this checkpoint callback has no
# effect as written — confirm whether it was meant to be wired in.
periodic_save = keras.callbacks.ModelCheckpoint('model_{epoch:03d}.h5',save_freq="epoch")

# NOTE(review): `validation_steps` is supplied without `validation_data`,
# and `steps_per_epoch` is omitted even though the dataset is infinite
# (`.repeat()`) — both look inconsistent with the fit() contract.
hist = model.fit(get_train_ds().repeat(),epochs = 5,verbose = 1,validation_steps = 10)
错误信息
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-23-d088df3513e2> in <module>()
20 callbacks = [lr_callback],'wb') as file_pi:
22 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training_v1.py in fit(self,x,y,batch_size,epochs,verbose,callbacks,validation_split,validation_data,shuffle,class_weight,sample_weight,initial_epoch,steps_per_epoch,validation_steps,validation_freq,max_queue_size,workers,use_multiprocessing,**kwargs)
807 max_queue_size=max_queue_size,808 workers=workers,--> 809 use_multiprocessing=use_multiprocessing)
810
811 def evaluate(self,/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training_distributed.py in fit(self,model,**kwargs)
668 steps_per_epoch=steps_per_epoch,669 validation_steps=validation_steps,--> 670 validation_freq=validation_freq)
671
672 return training_arrays.fit_loop(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training_distributed.py in experimental_tpu_fit_loop(model,dataset,val_dataset,validation_freq)
271 steps=validation_steps,272 verbose=verbose,--> 273 callbacks=callbacks)
274 if not isinstance(val_outs,list):
275 val_outs = [val_outs]
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training_distributed.py in experimental_tpu_test_loop(model,steps,callbacks)
340 test_input_data = iterator.get_next()
341 per_replica_outputs = current_strategy.run(
--> 342 _test_step_fn,args=(test_input_data,))
343 output_tensors = {}
344 for label,output in zip(out_labels,per_replica_outputs):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in run(self,fn,args,kwargs,options)
344 fn = autograph.tf_convert(fn,autograph_ctx.control_status_ctx())
345 options = options or distribute_lib.RunOptions()
--> 346 return self.extended.tpu_run(fn,options)
347
348 @property
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in tpu_run(self,options)
1093 def tpu_run(self,options=None):
1094 func = self._tpu_function_creator(fn,options)
-> 1095 return func(args,kwargs)
1096
1097 def _tpu_function_creator(self,options):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in tpu_function(args,kwargs)
1160 device_assignment=self._device_assignment,1161 maximum_shapes=maximum_shapes,-> 1162 padding_spec=padding_spec)
1163
1164 # Remove all no ops that may have been added during 'tpu.replicate()'
/usr/local/lib/python3.7/dist-packages/tensorflow/python/tpu/tpu.py in replicate(computation,inputs,inFeed_queue,device_assignment,name,maximum_shapes,padding_spec)
913 name,914 maximum_shapes=maximum_shapes,--> 915 padding_spec=padding_spec)[1]
916
917
/usr/local/lib/python3.7/dist-packages/tensorflow/python/tpu/tpu.py in split_compile_and_replicate(***Failed resolving arguments***)
1378 vscope.set_custom_getter(custom_getter)
1379
-> 1380 outputs = computation(*computation_inputs)
1381
1382 vscope.set_use_resource(saved_use_resource)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in replicated_fn(replica_id,replica_args,replica_kwargs)
1122 """Wraps user function to provide replica ID and `Tensor` inputs."""
1123 with _TPUReplicaContext(strategy,replica_id_in_sync_group=replica_id):
-> 1124 result[0] = fn(*replica_args,**replica_kwargs)
1125 return result[0]
1126
/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args,**kwargs)
253 try:
254 with conversion_ctx:
--> 255 return converted_call(f,options=options)
256 except Exception as e: # pylint:disable=broad-except
257 if hasattr(e,'ag_error_Metadata'):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py in converted_call(f,caller_fn_scope,options)
530
531 if not options.user_requested and conversion.is_whitelisted(f):
--> 532 return _call_unconverted(f,options)
533
534 # internal_convert_user_code is for example turned off when issuing a dynamic
/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py in _call_unconverted(f,options,update_cache)
337
338 if kwargs is not None:
--> 339 return f(*args,**kwargs)
340 return f(*args)
341
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training_distributed.py in _test_step_fn(inputs)
331
332 (distribution_strategy_context.get_replica_context().merge_call(
--> 333 _build_model,args=(model,mode,targets)))
334
335 (_,outputs,updates,_) = _per_replica_execution_function(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in merge_call(self,merge_fn,kwargs)
2713 merge_fn = autograph.tf_convert(
2714 merge_fn,autograph_ctx.control_status_ctx(),convert_by_default=False)
-> 2715 return self._merge_call(merge_fn,kwargs)
2716
2717 def _merge_call(self,kwargs):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in _merge_call(self,kwargs)
2720 distribution_strategy_context._CrossReplicaThreadMode(self._strategy)) # pylint: disable=protected-access
2721 try:
-> 2722 return merge_fn(self._strategy,*args,**kwargs)
2723 finally:
2724 _pop_per_thread_mode()
/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args,**kwargs)
273 def wrapper(*args,**kwargs):
274 with ag_ctx.ControlStatusCtx(status=ag_ctx.Status.UNSPECIFIED):
--> 275 return func(*args,**kwargs)
276
277 if inspect.isfunction(func) or inspect.ismethod(func):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training_distributed.py in _build_model(strategy,targets)
55 else:
56 dist_utils._build_distributed_network(model,strategy,---> 57 targets)
58
59
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/distribute/distributed_training_utils.py in _build_distributed_network(model,targets)
780 distributed_model = strategy.extended.call_for_each_replica(
781 _build_network_on_replica,--> 782 args=(model,targets))
783 set_distributed_model(model,distributed_model)
784
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in call_for_each_replica(self,kwargs)
2583 kwargs = {}
2584 with self._container_strategy().scope():
-> 2585 return self._call_for_each_replica(fn,kwargs)
2586
2587 def _call_for_each_replica(self,kwargs):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _call_for_each_replica(self,kwargs)
741 # we're in a tpu.rewrite(),and update TPUMirroredVariable accordingly.
742 with _TPUReplicaContext(self._container_strategy()):
--> 743 return fn(*args,**kwargs)
744
745 @contextlib.contextmanager
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/distribute/distributed_training_utils.py in _build_network_on_replica(model,targets)
740 else:
741 updated_model = models._clone_functional_model(
--> 742 model,input_tensors=inputs,layer_fn=models.share_weights)
743 # Callable losses added directly to a functional Model need to be added
744 # here.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/models.py in _clone_functional_model(model,input_tensors,layer_fn)
178 input_tensors = nest.flatten(input_tensors)
179 for i,input_tensor in enumerate(input_tensors):
--> 180 original_input_layer = model._input_layers[i]
181
182 # Cache input layer. Create a new layer if the tensor is originally not
IndexError: list index out of range
解决方法
暂未找到可以解决该问题的有效方法，小编正在努力寻找和整理中！
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)