Azure ML - 如何修复此快照异常?

问题描述

我正在 Azure ML SDK 中创建管道。在我运行管道一段时间后,它报告我已达到 300MB 的快照限制。我遵循了一些提议的修复:

  • 将每个步骤脚本移动到一个单独的子文件夹中

  • 我向管道添加一个数据存储区

  • 添加了这一行:azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 1000

但是在我提交管道后发生了一个新的快照错误

pipeline1 = Pipeline(default_source_directory=".",default_datastore=def_blob_store,workspace=ws,steps=[prep_step,hd_step,register_model_step])

错误信息:

    WARNING:root:If 'script' has been provided here and a script file name has been specified in 'run_config','script' provided in Scriptrunconfig initialization will take precedence.
---------------------------------------------------------------------------
SnapshotException                         Traceback (most recent call last)
<ipython-input-14-05c5aa4991aa> in <module>
----> 1 pipeline1 = Pipeline(default_source_directory=".",register_model_step])
      2 pipeline1.validate()
      3 pipeline_run = Experiment(ws,'health_insuarance').submit(pipeline1,regenerate_outputs=False)
      4 RunDetails(pipeline_run).show()

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/core/_experiment_method.py in wrapper(self,*args,**kwargs)
     95             """
     96             ExperimentSubmitRegistrar.register_submit_function(self.__class__,submit_function)
---> 97             return init_func(self,**kwargs)
     98         return wrapper
     99     return real_decorator

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/pipeline.py in __init__(self,workspace,steps,description,default_datastore,default_source_directory,resolve_closure,_workflow_provider,_service_endpoint,**kwargs)
    175                 raise ValueError('parameter %s is not recognized for Pipeline ' % key)
    176         self._enable_email_notification = enable_email_notification
--> 177         self._graph = self._graph_builder.build(self._name,finalize=False)
    178 
    179     def _set_experiment_name(self,name):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/builder.py in build(self,name,finalize,regenerate_outputs)
   1479                 pass
   1480 
-> 1481         graph = self.construct(name,steps)
   1482         if finalize:
   1483             graph.finalize(regenerate_outputs=regenerate_outputs)

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/builder.py in construct(self,steps)
   1501         self._graph = Graph(name,self._context)
   1502         self._nodeStack.append([])
-> 1503         self.process_collection(steps)
   1504         for builder in self._builderStack[::-1]:
   1505             builder.apply_rules()

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/builder.py in process_collection(self,collection)
   1537         self._nodeStack.append([])
   1538         self._builderStack.append(builder)
-> 1539         builder.process_collection(collection)
   1540         added_nodes = self._nodeStack.pop()
   1541         self._nodeStack[-1].extend(added_nodes)

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/builder.py in process_collection(self,collection)
   1828         """
   1829         for item in collection:
-> 1830             self._base_builder.process_collection(item)
   1831 
   1832     def apply_rules(self):

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/builder.py in process_collection(self,collection)
   1531         # just a step?
   1532         if isinstance(collection,Pipelinestep):
-> 1533             return self.process_step(collection)
   1534 
   1535         # delegate to correct builder

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/core/builder.py in process_step(self,step)
   1575             return self._step2node[step]
   1576 
-> 1577         node = step.create_node(self._graph,self._default_datastore,self._context)
   1578         self.assert_node_valid(step,self._graph,node)
   1579 

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/steps/hyper_drive_step.py in create_node(self,graph,context)
    247         """
    248         hyperdrive_config,reuse_hashable_config = self._get_hyperdrive_config(context._workspace,--> 249                                                                                context._experiment_name)
    250         self._params[HyperDriveStep._run_config_param_name] = json.dumps(hyperdrive_config)
    251         self._params[HyperDriveStep._run_reuse_hashable_config] = json.dumps(reuse_hashable_config)

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/pipeline/steps/hyper_drive_step.py in _get_hyperdrive_config(self,experiment_name)
    323 
    324         hyperdrive_dto = _search._create_experiment_dto(self._hyperdrive_config,--> 325                                                         experiment_name,telemetry_values)
    326 
    327         hyperdrive_config = hyperdrive_dto.as_dict()

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/train/hyperdrive/_search.py in _create_experiment_dto(hyperdrive_config,experiment_name,telemetry_values,activity_logger,**kwargs)
     41     if hyperdrive_config.source_directory is not None:
     42         snapshot_client = SnapshotsClient(workspace.service_context)
---> 43         snapshot_id = snapshot_client.create_snapshot(hyperdrive_config.source_directory)
     44 
     45         if activity_logger is not None:

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/_restclient/snapshots_client.py in create_snapshot(self,file_or_folder_path,retry_on_failure,raise_on_validation_failure)
     83         exclude_function = ignore_file.is_file_excluded
     84 
---> 85         self._validate_snapshot_size(file_or_folder_path,exclude_function,raise_on_validation_failure)
     86 
     87         # Get the prevIoUs snapshot for this project

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/_restclient/snapshots_client.py in _validate_snapshot_size(self,raise_on_validation_failure)
     61                             "\n".format(file_or_folder_path,SNAPSHOT_MAX_SIZE_BYTES / ONE_MB)
     62             if raise_on_validation_failure:
---> 63                 raise SnapshotException(error_message)
     64             else:
     65                 self._logger.warning(error_message)

SnapshotException: SnapshotException:
    Message: ====================================================================

While attempting to take snapshot of ./train/
Your total snapshot size exceeds the limit of 0.00095367431640625 MB.
Please see http://aka.ms/aml-largefiles on how to work with large files.

====================================================================


    InnerException None
    ErrorResponse 
{
    "error": {
        "message": "====================================================================\n\nWhile attempting to take snapshot of ./train/\nYour total snapshot size exceeds the limit of 0.00095367431640625 MB.\nPlease see http://aka.ms/aml-largefiles on how to work with large files.\n\n====================================================================\n\n"
    }
}

知道如何解决这个问题吗?

完整脚本在这里:Script at GitHub

解决方法

好的,所以我找到了修复。

我更改了这一行,将数值设为等于 1 GB 的字节数:azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 1000000000

出于某种原因,即使默认限制表述为 300 MB,该变量也必须以字节而不是兆字节为单位设置——变量名 SNAPSHOT_MAX_SIZE_BYTES 本身也说明了这一点。之前设置的 1000 实际上只有 1000 字节(约 0.00095 MB),这正是错误信息中显示的限制值。不是特别直观。