Skip to content

Commit d9e2567

Browse files
Add deprecation warning for s3_data_distribution_type in Clarify Data Config (#2847)
1 parent 70308b1 commit d9e2567

File tree

2 files changed

+28
-3
lines changed

2 files changed

+28
-3
lines changed

src/sagemaker/clarify.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,17 @@
2222
import tempfile
2323
from abc import ABC, abstractmethod
2424
from sagemaker import image_uris, s3, utils
25+
from sagemaker.deprecations import deprecation_warning
2526
from sagemaker.processing import ProcessingInput, ProcessingOutput, Processor
2627

2728
logger = logging.getLogger(__name__)
2829

2930

31+
@deprecation_warning(
32+
msg="s3_data_distribution_type parameter will no longer be supported. Everything else will"
33+
" remain as is",
34+
date="15 Mar 2022",
35+
)
3036
class DataConfig:
3137
"""Config object related to configurations of the input and output dataset."""
3238

@@ -58,8 +64,8 @@ def __init__(
5864
dataset format is JSONLines.
5965
dataset_type (str): Format of the dataset. Valid values are "text/csv" for CSV,
6066
"application/jsonlines" for JSONLines, and "application/x-parquet" for Parquet.
61-
s3_data_distribution_type (str): Valid options are "FullyReplicated" or
62-
"ShardedByS3Key".
67+
s3_data_distribution_type (str): Deprecated. Only valid option is "FullyReplicated".
68+
Any other value is ignored: a warning is logged and "FullyReplicated" is used instead.
6369
s3_compression_type (str): Valid options are "None" or "Gzip".
6470
joinsource (str): The name or index of the column in the dataset that acts as an
6571
identifier column (for instance, while performing a join). This column is only
@@ -80,7 +86,13 @@ def __init__(
8086
self.s3_data_input_path = s3_data_input_path
8187
self.s3_output_path = s3_output_path
8288
self.s3_analysis_config_output_path = s3_analysis_config_output_path
83-
self.s3_data_distribution_type = s3_data_distribution_type
89+
if s3_data_distribution_type != "FullyReplicated":
90+
logger.warning(
91+
"s3_data_distribution_type parameter, set to %s, is being ignored. Only"
92+
" valid option is FullyReplicated",
93+
s3_data_distribution_type,
94+
)
95+
self.s3_data_distribution_type = "FullyReplicated"
8496
self.s3_compression_type = s3_compression_type
8597
self.label = label
8698
self.headers = headers

tests/unit/test_clarify.py

+13
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,19 @@ def test_invalid_data_config():
8282
)
8383

8484

85+
def test_s3_data_distribution_type_ignorance():
86+
data_config = DataConfig(
87+
s3_data_input_path="s3://input/train.csv",
88+
s3_output_path="s3://output/analysis_test_result",
89+
label="Label",
90+
headers=["Label", "F1", "F2", "F3", "F4"],
91+
dataset_type="text/csv",
92+
joinsource="F4",
93+
s3_data_distribution_type="ShardedByS3Key",
94+
)
95+
assert data_config.s3_data_distribution_type == "FullyReplicated"
96+
97+
8598
def test_bias_config():
8699
label_values = [1]
87100
facet_name = "F1"

0 commit comments

Comments
 (0)