Describe the bug
When you create a PyTorch estimator, train the model, and then use pytorch_estimator.model_data, it outputs s3://bucket-name/pytorch-training-2021-11-16-22-53-00-748/output/model.tar.gz. However, when you actually check inside the S3 bucket, the file is output.tar.gz, not model.tar.gz! And I believe that because of this, when I try to deploy using the same estimator, I get a ClientError. Renaming output.tar.gz to model.tar.gz in the S3 bucket fixes the problem.
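For reference, the manual rename workaround can be scripted roughly as follows. This is only a sketch, not part of the original report: the bucket and prefix values are hypothetical placeholders taken from the example output below, and it simply copies the existing artifact to the key that model_data expects.

import boto3

# Hypothetical values -- substitute your own bucket and training job output prefix.
bucket = "bucket-name"
prefix = "pytorch-training-2021-11-16-22-53-00-748/output"

s3 = boto3.client("s3")

# Copy the artifact that actually exists (output.tar.gz) to the key that
# pytorch_estimator.model_data points at (model.tar.gz), so deploy() can find it.
s3.copy_object(
    Bucket=bucket,
    CopySource={"Bucket": bucket, "Key": f"{prefix}/output.tar.gz"},
    Key=f"{prefix}/model.tar.gz",
)
# Optionally remove the old key to complete the "rename":
# s3.delete_object(Bucket=bucket, Key=f"{prefix}/output.tar.gz")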
To reproduce
import sagemaker
from sagemaker.pytorch import PyTorch

# bucket_name is assumed to be defined elsewhere in the notebook.

hyperparameters = {
    "epochs": 10,
    "batch-size": 64,
    "embedding-dim": 125,
    "hidden-dim": 2
}

estimator_config = {
    "entry_point": "train_script.py",
    "source_dir": "scripts",  # we provide source_dir in order to install torchtext!!
    "framework_version": "1.9",
    "py_version": "py38",
    "instance_type": "ml.m5.xlarge",
    "instance_count": 1,
    "role": sagemaker.get_execution_role(),
    "output_path": f"s3://{bucket_name}",  # if this is not specified then SM will create a new bucket to store artifacts
    "hyperparameters": hyperparameters,
}

pytorch_estimator = PyTorch(**estimator_config)

# https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html
data_channels = {"train": f"s3://{bucket_name}/data/"}
pytorch_estimator.fit(data_channels)

pytorch_estimator.model_data
# outputs -> s3://bucket-name/pytorch-training-2021-11-16-22-53-00-748/output/model.tar.gz

pytorch_estimator.deploy(instance_type='ml.m5.xlarge',
                         initial_instance_count=1)
# ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
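To confirm what deploy() is tripping over, here is a minimal check (a sketch, not part of the original report; it assumes the estimator from the snippet above and default AWS credentials) that the key reported by model_data does not exist, while output.tar.gz under the same prefix does:

import boto3
from urllib.parse import urlparse

# Parse the S3 URI reported by the estimator (s3://bucket-name/.../output/model.tar.gz).
parsed = urlparse(pytorch_estimator.model_data)
bucket, key = parsed.netloc, parsed.path.lstrip("/")

s3 = boto3.client("s3")
s3.head_object(Bucket=bucket, Key=key)                                            # raises the same 404 ClientError
s3.head_object(Bucket=bucket, Key=key.replace("model.tar.gz", "output.tar.gz"))   # succeeds -- the artifact is here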
Expected behavior
The output of pytorch_estimator.model_data should contain the actual name of the tar file, which is output.tar.gz, and the endpoint should deploy without a ClientError.
Screenshots or logs
ClientError                               Traceback (most recent call last)
<ipython-input-57-5ceec0733f49> in <module>
      1 pytorch_estimator.deploy(instance_type='ml.m5.xlarge',
----> 2                          initial_instance_count=1)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, use_compiled_model, wait, model_name, kms_key, data_capture_config, tags, **kwargs)
    963             wait=wait,
    964             kms_key=kms_key,
--> 965             data_capture_config=data_capture_config,
    966         )
    967 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/model.py in deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, **kwargs)
    709             self._base_name = "-".join((self._base_name, compiled_model_suffix))
    710 
--> 711         self._create_sagemaker_model(instance_type, accelerator_type, tags)
    712         production_variant = sagemaker.production_variant(
    713             self.name, instance_type, initial_instance_count, accelerator_type=accelerator_type

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/model.py in _create_sagemaker_model(self, instance_type, accelerator_type, tags)
    263             /api/latest/reference/services/sagemaker.html#SageMaker.Client.add_tags
    264         """
--> 265         container_def = self.prepare_container_def(instance_type, accelerator_type=accelerator_type)
    266 
    267         self._ensure_base_name_if_needed(container_def["Image"])
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/pytorch/model.py in prepare_container_def(self, instance_type, accelerator_type)
237
238 deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image)
--> 239 self._upload_code(deploy_key_prefix, repack=self._is_mms_version())
240 deploy_env = dict(self.env)
241 deploy_env.update(self._framework_env_vars())
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/model.py in _upload_code(self, key_prefix, repack)
1088 repacked_model_uri=repacked_model_data,
1089 sagemaker_session=self.sagemaker_session,
-> 1090 kms_key=self.model_kms_key,
1091 )
1092
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in repack_model(inference_script, source_directory, dependencies, model_uri, repacked_model_uri, sagemaker_session, kms_key)
410
411 with _tmpdir() as tmp:
--> 412 model_dir = _extract_model(model_uri, sagemaker_session, tmp)
413
414 _create_or_update_code_dir(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in _extract_model(model_uri, sagemaker_session, tmp)
484 if model_uri.lower().startswith("s3://"):
485 local_model_path = os.path.join(tmp, "tar_file")
--> 486 download_file_from_url(model_uri, local_model_path, sagemaker_session)
487 else:
488 local_model_path = model_uri.replace("file://", "")
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in download_file_from_url(url, dst, sagemaker_session)
497 bucket, key = url.netloc, url.path.lstrip("/")
498
--> 499 download_file(bucket, key, dst, sagemaker_session)
500
501
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/utils.py in download_file(bucket_name, path, target, sagemaker_session)
515 s3 = boto_session.resource("s3", region_name=sagemaker_session.boto_region_name)
516 bucket = s3.Bucket(bucket_name)
--> 517 bucket.download_file(path, target)
518
519
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/boto3/s3/inject.py in bucket_download_file(self, Key, Filename, ExtraArgs, Callback, Config)
245 return self.meta.client.download_file(
246 Bucket=self.name, Key=Key, Filename=Filename,
--> 247 ExtraArgs=ExtraArgs, Callback=Callback, Config=Config)
248
249
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/boto3/s3/inject.py in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
171 return transfer.download_file(
172 bucket=Bucket, key=Key, filename=Filename,
--> 173 extra_args=ExtraArgs, callback=Callback)
174
175
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/boto3/s3/transfer.py in download_file(self, bucket, key, filename, extra_args, callback)
305 bucket, key, filename, extra_args, subscribers)
306 try:
--> 307 future.result()
308 # This is for backwards compatibility where when retries are
309 # exceeded we need to throw the same error from boto3 instead of
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/futures.py in result(self)
104 # however if a KeyboardInterrupt is raised we want want to exit
105 # out of this and propogate the exception.
--> 106 return self._coordinator.result()
107 except KeyboardInterrupt as e:
108 self.cancel()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/futures.py in result(self)
263 # final result.
264 if self._exception:
--> 265 raise self._exception
266 return self._result
267
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/tasks.py in _main(self, transfer_future, **kwargs)
253 # Call the submit method to start submitting tasks to execute the
254 # transfer.
--> 255 self._submit(transfer_future=transfer_future, **kwargs)
256 except BaseException as e:
257 # If there was an exception raised during the submission of task
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/s3transfer/download.py in _submit(self, client, config, osutil, request_executor, io_executor, transfer_future, bandwidth_limiter)
341 Bucket=transfer_future.meta.call_args.bucket,
342 Key=transfer_future.meta.call_args.key,
--> 343 **transfer_future.meta.call_args.extra_args
344 )
345 transfer_future.meta.provide_transfer_size(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
386 "%s() only accepts keyword arguments." % py_operation_name)
387 # The "self" in this scope is referring to the BaseClient.
--> 388 return self._make_api_call(operation_name, kwargs)
389
390 _api_call.__name__ = str(py_operation_name)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
706 error_code = parsed_response.get("Error", {}).get("Code")
707 error_class = self.exceptions.from_code(error_code)
--> 708 raise error_class(parsed_response, operation_name)
709 else:
710 return parsed_response
ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
System information
A description of your system. Please provide:
SageMaker Python SDK version: 2.68.0
Framework name (eg. PyTorch) or algorithm (eg. KMeans): PyTorch
I believe it might be related to this issue #1354
Thanks a ton
Best
CC @nadiaya