Skip to content

Commit ceb1f8e

Browse files
authored
Merge branch 'master' into update_HF_image
2 parents 66853ea + da08405 commit ceb1f8e

29 files changed

+353
-88
lines changed

CHANGELOG.md

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,59 @@
11
# Changelog
22

3+
## v2.55.0 (2021-08-25)
4+
5+
### Features
6+
7+
* Add information of Amazon-provided analysis image used by Mo…
8+
9+
### Bug Fixes and Other Changes
10+
11+
* Update Changelog to fix release
12+
* Fixing the order of populating container list
13+
* pass network isolation config to pipelineModel
14+
* Dereference symbolic links when creating tar file
15+
* multiprocess issue in feature_group.py
16+
* deprecate tag logic on Association
17+
18+
### Documentation Changes
19+
20+
* add dataset_definition to processing page
21+
22+
## v2.54.0 (2021-08-16)
23+
24+
### Features
25+
26+
* add pytorch 1.5.1 eia configuration
27+
28+
### Bug Fixes and Other Changes
29+
30+
* issue #2253 where Processing job in Local mode would call Describe API
31+
32+
## v2.53.0 (2021-08-12)
33+
34+
### Features
35+
36+
* support tuning step parameter range parameterization + support retry strategy in tuner
37+
38+
## v2.52.2.post0 (2021-08-11)
39+
40+
### Documentation Changes
41+
42+
* clarify that default_bucket creates a bucket
43+
* Minor updates to Clarify API documentation
44+
45+
## v2.52.2 (2021-08-10)
46+
47+
### Bug Fixes and Other Changes
48+
49+
* sklearn integ tests, remove swallowing exception on feature group delete attempt
50+
* sklearn integ test for custom bucket
51+
52+
### Documentation Changes
53+
54+
* Fix dataset_definition links
55+
* Document LambdaModel and LambdaPredictor classes
56+
357
## v2.52.1 (2021-08-06)
458

559
### Bug Fixes and Other Changes

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.52.2.dev0
1+
2.55.1.dev0

doc/api/utility/inputs.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,8 @@ Inputs
55
:members:
66
:undoc-members:
77
:show-inheritance:
8-
:noindex:
8+
9+
.. automodule:: sagemaker.dataset_definition.inputs
10+
:members:
11+
:undoc-members:
12+
:show-inheritance:

doc/workflows/pipelines/sagemaker.workflow.pipelines.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ ConditionStep
55
-------------
66

77
.. autoclass:: sagemaker.workflow.condition_step.ConditionStep
8-
98
.. deprecated:: sagemaker.workflow.condition_step.JsonGet
109

1110
Conditions

src/sagemaker/automl/automl.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ def create_model(
328328
predictor_cls=predictor_cls,
329329
name=name,
330330
vpc_config=vpc_config,
331+
enable_network_isolation=enable_network_isolation,
331332
sagemaker_session=sagemaker_session or self.sagemaker_session,
332333
)
333334
return pipeline

src/sagemaker/clarify.py

Lines changed: 31 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,17 @@ def __init__(
4848
headers (list[str]): A list of column names in the input dataset.
4949
features (str): JSONPath for locating the feature columns for bias metrics if the
5050
dataset format is JSONLines.
51-
dataset_type (str): Format of the dataset. Valid values are "text/csv" for CSV
52-
and "application/jsonlines" for JSONLines.
51+
dataset_type (str): Format of the dataset. Valid values are "text/csv" for CSV,
52+
"application/jsonlines" for JSONLines, and "application/x-parquet" for Parquet.
5353
s3_data_distribution_type (str): Valid options are "FullyReplicated" or
5454
"ShardedByS3Key".
5555
s3_compression_type (str): Valid options are "None" or "Gzip".
5656
"""
57+
if dataset_type not in ["text/csv", "application/jsonlines", "application/x-parquet"]:
58+
raise ValueError(
59+
f"Invalid dataset_type '{dataset_type}'."
60+
f" Please check the API documentation for the supported dataset types."
61+
)
5762
self.s3_data_input_path = s3_data_input_path
5863
self.s3_output_path = s3_output_path
5964
self.s3_data_distribution_type = s3_data_distribution_type
@@ -508,7 +513,7 @@ def run_pre_training_bias(
508513
kms_key=None,
509514
experiment_config=None,
510515
):
511-
"""Runs a ProcessingJob to compute the requested bias 'methods' of the input data.
516+
"""Runs a ProcessingJob to compute the pre-training bias methods of the input data.
512517
513518
Computes the requested methods that compare 'methods' (e.g. fraction of examples) for the
514519
sensitive group vs the other examples.
@@ -517,14 +522,14 @@ def run_pre_training_bias(
517522
data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
518523
data_bias_config (:class:`~sagemaker.clarify.BiasConfig`): Config of sensitive groups.
519524
methods (str or list[str]): Selector of a subset of potential metrics:
520-
["`CI <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-ci.html>`_",
521-
"`DPL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-dpl.html>`_",
522-
"`KL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-kl.html>`_",
523-
"`JS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-js.html>`_",
524-
"`LP <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-lp.html>`_",
525-
"`TVD <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-tvd.html>`_",
526-
"`KS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-ks.html>`_",
527-
"`CDDL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-cdd.html>`_"].
525+
["`CI <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-bias-metric-class-imbalance.html>`_",
526+
"`DPL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-true-label-imbalance.html>`_",
527+
"`KL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kl-divergence.html>`_",
528+
"`JS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-jensen-shannon-divergence.html>`_",
529+
"`LP <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-lp-norm.html>`_",
530+
"`TVD <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-total-variation-distance.html>`_",
531+
"`KS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kolmogorov-smirnov.html>`_",
532+
"`CDDL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-cddl.html>`_"].
528533
Defaults to computing all.
529534
wait (bool): Whether the call should wait until the job completes (default: True).
530535
logs (bool): Whether to show the logs produced by the job.
@@ -538,7 +543,7 @@ def run_pre_training_bias(
538543
experiment_config (dict[str, str]): Experiment management configuration.
539544
Dictionary contains three optional keys:
540545
'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
541-
"""
546+
""" # noqa E501
542547
analysis_config = data_config.get_config()
543548
analysis_config.update(data_bias_config.get_config())
544549
analysis_config["methods"] = {"pre_training_bias": {"methods": methods}}
@@ -562,7 +567,7 @@ def run_post_training_bias(
562567
kms_key=None,
563568
experiment_config=None,
564569
):
565-
"""Runs a ProcessingJob to compute the requested bias 'methods' of the model predictions.
570+
"""Runs a ProcessingJob to compute the post-training bias methods of the model predictions.
566571
567572
Spins up a model endpoint, runs inference over the input example in the
568573
's3_data_input_path' to obtain predicted labels. Computes the requested methods that
@@ -633,12 +638,11 @@ def run_bias(
633638
kms_key=None,
634639
experiment_config=None,
635640
):
636-
"""Runs a ProcessingJob to compute the requested bias 'methods' of the model predictions.
641+
"""Runs a ProcessingJob to compute the requested bias methods.
637642
638-
Spins up a model endpoint, runs inference over the input example in the
639-
's3_data_input_path' to obtain predicted labels. Computes a the requested methods that
640-
compare 'methods' (e.g. accuracy, precision, recall) for the sensitive group vs the other
641-
examples.
643+
It computes the metrics of both the pre-training methods and the post-training methods.
644+
To calculate post-training methods, it needs to spin up a model endpoint and run inference
645+
over the input example in the 's3_data_input_path' to obtain predicted labels.
642646
643647
Args:
644648
data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
@@ -648,14 +652,14 @@ def run_bias(
648652
model_predicted_label_config (:class:`~sagemaker.clarify.ModelPredictedLabelConfig`):
649653
Config of how to extract the predicted label from the model output.
650654
pre_training_methods (str or list[str]): Selector of a subset of potential metrics:
651-
["`CI <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-ci.html>`_",
652-
"`DPL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-dpl.html>`_",
653-
"`KL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-kl.html>`_",
654-
"`JS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-js.html>`_",
655-
"`LP <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-lp.html>`_",
656-
"`TVD <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-tvd.html>`_",
657-
"`KS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-ks.html>`_",
658-
"`CDDL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-cdd.html>`_"].
655+
["`CI <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-bias-metric-class-imbalance.html>`_",
656+
"`DPL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-true-label-imbalance.html>`_",
657+
"`KL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kl-divergence.html>`_",
658+
"`JS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-jensen-shannon-divergence.html>`_",
659+
"`LP <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-lp-norm.html>`_",
660+
"`TVD <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-total-variation-distance.html>`_",
661+
"`KS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kolmogorov-smirnov.html>`_",
662+
"`CDDL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-cddl.html>`_"].
659663
Defaults to computing all.
660664
post_training_methods (str or list[str]): Selector of a subset of potential metrics:
661665
["`DPPL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-dppl.html>`_"
@@ -682,7 +686,7 @@ def run_bias(
682686
experiment_config (dict[str, str]): Experiment management configuration.
683687
Dictionary contains three optional keys:
684688
'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
685-
"""
689+
""" # noqa E501
686690
analysis_config = data_config.get_config()
687691
analysis_config.update(bias_config.get_config())
688692
analysis_config["predictor"] = model_config.get_predictor_config()

src/sagemaker/dataset_definition/inputs.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class RedshiftDatasetDefinition(ApiObject):
2727
2828
With this input, SQL queries will be executed using Redshift to generate datasets to S3.
2929
30-
Attributes:
30+
Parameters:
3131
cluster_id (str): The Redshift cluster Identifier.
3232
database (str): The name of the Redshift database used in Redshift query execution.
3333
db_user (str): The database user name used in Redshift query execution.
@@ -60,7 +60,7 @@ class AthenaDatasetDefinition(ApiObject):
6060
6161
With this input, SQL queries will be executed using Athena to generate datasets to S3.
6262
63-
Attributes:
63+
Parameters:
6464
catalog (str): The name of the data catalog used in Athena query execution.
6565
database (str): The name of the database used in the Athena query execution.
6666
query_string (str): The SQL query statements, to be executed.
@@ -87,7 +87,7 @@ class AthenaDatasetDefinition(ApiObject):
8787
class DatasetDefinition(ApiObject):
8888
"""DatasetDefinition input.
8989
90-
Attributes:
90+
Parameters:
9191
data_distribution_type (str): Whether the generated dataset is FullyReplicated or
9292
ShardedByS3Key (default).
9393
input_mode (str): Whether to use File or Pipe input mode. In File (default) mode, Amazon
@@ -98,10 +98,9 @@ class DatasetDefinition(ApiObject):
9898
local_path (str): The local path where you want Amazon SageMaker to download the Dataset
9999
Definition inputs to run a processing job. LocalPath is an absolute path to the input
100100
data. This is a required parameter when `AppManaged` is False (default).
101-
redshift_dataset_definition
102-
(:class:`~sagemaker.dataset_definition.RedshiftDatasetDefinition`): Redshift
103-
dataset definition.
104-
athena_dataset_definition (:class:`~sagemaker.dataset_definition.AthenaDatasetDefinition`):
101+
redshift_dataset_definition (:class:`~sagemaker.dataset_definition.inputs.RedshiftDatasetDefinition`):
102+
Configuration for Redshift Dataset Definition input.
103+
athena_dataset_definition (:class:`~sagemaker.dataset_definition.inputs.AthenaDatasetDefinition`):
105104
Configuration for Athena Dataset Definition input.
106105
"""
107106

@@ -126,7 +125,7 @@ class S3Input(ApiObject):
126125
S3 list operations are not strongly consistent.
127126
Use ManifestFile if strong consistency is required.
128127
129-
Attributes:
128+
Parameters:
130129
s3_uri (str): the path to a specific S3 object or a S3 prefix
131130
local_path (str): the path to a local directory. If not provided, skips data download
132131
by SageMaker platform.

src/sagemaker/feature_store/feature_group.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,8 @@ def _ingest_single_batch(
207207
for row in data_frame[start_index:end_index].itertuples():
208208
record = [
209209
FeatureValue(
210-
feature_name=data_frame.columns[index - 1], value_as_string=str(row[index])
210+
feature_name=data_frame.columns[index - 1],
211+
value_as_string=str(row[index]),
211212
)
212213
for index in range(1, len(row))
213214
if pd.notna(row[index])
@@ -270,13 +271,24 @@ def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None):
270271
timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
271272
if timeout is reached.
272273
"""
274+
# pylint: disable=I1101
273275
batch_size = math.ceil(data_frame.shape[0] / self.max_processes)
276+
# pylint: enable=I1101
274277

275278
args = []
276279
for i in range(self.max_processes):
277280
start_index = min(i * batch_size, data_frame.shape[0])
278281
end_index = min(i * batch_size + batch_size, data_frame.shape[0])
279-
args += [(data_frame[start_index:end_index], start_index, timeout)]
282+
args += [
283+
(
284+
self.max_workers,
285+
self.feature_group_name,
286+
self.sagemaker_fs_runtime_client_config,
287+
data_frame[start_index:end_index],
288+
start_index,
289+
timeout,
290+
)
291+
]
280292

281293
def init_worker():
282294
# ignore keyboard interrupts in child processes.
@@ -285,13 +297,21 @@ def init_worker():
285297
self._processing_pool = ProcessingPool(self.max_processes, init_worker)
286298
self._processing_pool.restart(force=True)
287299

288-
f = lambda x: self._run_multi_threaded(*x) # noqa: E731
300+
f = lambda x: IngestionManagerPandas._run_multi_threaded(*x) # noqa: E731
289301
self._async_result = self._processing_pool.amap(f, args)
290302

291303
if wait:
292304
self.wait(timeout=timeout)
293305

294-
def _run_multi_threaded(self, data_frame: DataFrame, row_offset=0, timeout=None) -> List[int]:
306+
@staticmethod
307+
def _run_multi_threaded(
308+
max_workers: int,
309+
feature_group_name: str,
310+
sagemaker_fs_runtime_client_config: Config,
311+
data_frame: DataFrame,
312+
row_offset=0,
313+
timeout=None,
314+
) -> List[int]:
295315
"""Start the ingestion process.
296316
297317
Args:
@@ -305,21 +325,23 @@ def _run_multi_threaded(self, data_frame: DataFrame, row_offset=0, timeout=None)
305325
Returns:
306326
List of row indices that failed to be ingested.
307327
"""
308-
executor = ThreadPoolExecutor(max_workers=self.max_workers)
309-
batch_size = math.ceil(data_frame.shape[0] / self.max_workers)
328+
executor = ThreadPoolExecutor(max_workers=max_workers)
329+
# pylint: disable=I1101
330+
batch_size = math.ceil(data_frame.shape[0] / max_workers)
331+
# pylint: enable=I1101
310332

311333
futures = {}
312-
for i in range(self.max_workers):
334+
for i in range(max_workers):
313335
start_index = min(i * batch_size, data_frame.shape[0])
314336
end_index = min(i * batch_size + batch_size, data_frame.shape[0])
315337
futures[
316338
executor.submit(
317-
self._ingest_single_batch,
318-
feature_group_name=self.feature_group_name,
339+
IngestionManagerPandas._ingest_single_batch,
340+
feature_group_name=feature_group_name,
319341
data_frame=data_frame,
320342
start_index=start_index,
321343
end_index=end_index,
322-
client_config=self.sagemaker_fs_runtime_client_config,
344+
client_config=sagemaker_fs_runtime_client_config,
323345
)
324346
] = (start_index + row_offset, end_index + row_offset)
325347

src/sagemaker/image_uri_config/model-monitor.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"ap-east-1": "001633400207",
88
"ap-northeast-1": "574779866223",
99
"ap-northeast-2": "709848358524",
10+
"ap-northeast-3": "990339680094",
1011
"ap-south-1": "126357580389",
1112
"ap-southeast-1": "245545462676",
1213
"ap-southeast-2": "563025443158",

0 commit comments

Comments
 (0)