
Commit 37ad9e4

Merge branch 'master' into feat/jumpstart-instance-types
2 parents c28134a + 849ed29

16 files changed: +841 -76 lines

CHANGELOG.md (+20)

@@ -1,5 +1,25 @@
 # Changelog
 
+## v2.135.1.post0 (2023-03-02)
+
+### Documentation Changes
+
+ * update feature store dataset builder docs
+
+## v2.135.1 (2023-03-01)
+
+### Bug Fixes and Other Changes
+
+ * Revert back to stable apache-airflow-providers-amazon from 7.2.1 to 4.0.0.
+ * Typo in graviton algos
+ * build(deps): bump apache-airflow-providers-amazon from 4.0.0 to 7.2.1 in /requirements/extras
+ * Support cloning private repo using ssh key
+ * Create a default SageMaker Session inside FeatureGroup class
+
+### Documentation Changes
+
+ * fix typo in README
+
 ## v2.135.0 (2023-02-23)
 
 ### Features

README.rst (+1 -1)

@@ -126,7 +126,7 @@ To run the unit tests with tox, run:
 
     tox tests/unit
 
-**Integrations tests**
+**Integration tests**
 
 To run the integration tests, the following prerequisites must be met
 
VERSION (+1 -1)

@@ -1 +1 @@
-2.135.1.dev0
+2.135.2.dev0

doc/amazon_sagemaker_featurestore.rst (+97)

@@ -380,6 +380,102 @@ location for the data set to be saved there.
 From here you can train a model using this data set and then perform
 inference.
 
+.. rubric:: Using the Offline Store SDK: Getting Started
+   :name: bCe9CA61b79
+
+The Feature Store Offline SDK provides the ability to quickly and easily
+build ML-ready datasets for use by ML model training or pre-processing.
+The SDK makes it easy to build datasets from SQL join, point-in-time accurate
+join, and event range time frames, all without the need to write any SQL code.
+This functionality is accessed via the DatasetBuilder class which is the
+primary entry point for the SDK functionality.
+
+.. code:: python
+
+   from sagemaker.feature_store.feature_store import FeatureStore
+
+   feature_store = FeatureStore(sagemaker_session=feature_store_session)
+
+.. code:: python
+
+   base_feature_group = identity_feature_group
+   target_feature_group = transaction_feature_group
+
+You can create a dataset using the `create_dataset` method of the feature store API.
+`base` can either be a feature group or a pandas dataframe.
+
+.. code:: python
+
+   result_df, query = feature_store.create_dataset(
+       base=base_feature_group,
+       output_path=f"s3://{s3_bucket_name}"
+   ).to_dataframe()
+
+If you want to join another feature group, you can specify it using the
+`with_feature_group` method.
+
+.. code:: python
+
+   dataset_builder = feature_store.create_dataset(
+       base=base_feature_group,
+       output_path=f"s3://{s3_bucket_name}"
+   ).with_feature_group(target_feature_group, record_identifier_name)
+
+   result_df, query = dataset_builder.to_dataframe()
+
+.. rubric:: Using the Offline Store SDK: Configuring the DatasetBuilder
+   :name: bCe9CA61b80
+
+How the DatasetBuilder produces the resulting dataframe can be configured
+in various ways.
+
+By default, the Python SDK will exclude all deleted and duplicate records.
+However, if you need either of them in the returned dataset, you can call
+`include_duplicated_records` or `include_deleted_records` when creating the
+dataset builder.
+
+.. code:: python
+
+   dataset_builder.include_duplicated_records()
+   dataset_builder.include_deleted_records()
+
+The DatasetBuilder provides `with_number_of_records_from_query_results` and
+`with_number_of_recent_records_by_record_identifier` methods to limit the
+number of records returned for the offline snapshot.
+
+`with_number_of_records_from_query_results` limits the total number of records
+in the output. For example, when N = 100, only 100 records are returned in
+either the CSV or dataframe.
+
+.. code:: python
+
+   dataset_builder.with_number_of_records_from_query_results(number_of_records=N)
+
+On the other hand, `with_number_of_recent_records_by_record_identifier` is
+used to deal with records which have the same identifier. They are sorted
+according to `event_time`, and at most the N most recent records per
+identifier are returned in the output.
+
+.. code:: python
+
+   dataset_builder.with_number_of_recent_records_by_record_identifier(number_of_recent_records=N)
+
+Since these functions return the dataset builder, they can be chained.
+
+.. code:: python
+
+   result_df, query = (
+       dataset_builder
+       .with_number_of_records_from_query_results(number_of_records=N)
+       .include_duplicated_records()
+       .with_number_of_recent_records_by_record_identifier(number_of_recent_records=N)
+       .to_dataframe()
+   )
+
+There are additional configurations that can be made for various use cases,
+such as time travel and point-in-time join. These are outlined in the
+Feature Store `DatasetBuilder API Reference
+<https://sagemaker.readthedocs.io/en/stable/api/prep_data/feature_store.html#dataset-builder>`__.
+
 .. rubric:: Delete a feature group
    :name: bCe9CA61b78
 
@@ -395,3 +491,4 @@ The following code example is from the fraud detection example.
 
    identity_feature_group.delete()
    transaction_feature_group.delete()
+

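As a quick illustration of the point-in-time options mentioned at the end of the documentation above, the sketch below chains the builder methods shown in this diff on a pandas entity dataframe. It is only a sketch: the `record_identifier_feature_name` and `event_time_identifier_feature_name` keyword names, the column names, and the `feature_store_session`, `s3_bucket_name`, and `transaction_feature_group` variables are assumptions carried over from the surrounding examples, not part of this commit.

    import pandas as pd
    from sagemaker.feature_store.feature_store import FeatureStore

    feature_store = FeatureStore(sagemaker_session=feature_store_session)  # assumed session

    # Hypothetical entity dataframe: one row per identifier with the event time of interest.
    events_df = pd.DataFrame(
        {
            "customer_id": ["C1", "C2"],
            "event_time": ["2023-02-01T00:00:00Z", "2023-02-15T00:00:00Z"],
        }
    )

    result_df, query = (
        feature_store.create_dataset(
            base=events_df,
            record_identifier_feature_name="customer_id",      # assumed keyword name
            event_time_identifier_feature_name="event_time",   # assumed keyword name
            output_path=f"s3://{s3_bucket_name}",
        )
        .with_feature_group(transaction_feature_group, "customer_id")
        .point_in_time_accurate_join()
        .to_dataframe()
    )
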
src/sagemaker/feature_store/dataset_builder.py (+27 -18)

@@ -171,24 +171,33 @@ class DatasetBuilder:
     _event_time_identifier_feature_name (str): A string representing the event time identifier
         feature if base is a DataFrame (default: None).
     _included_feature_names (List[str]): A list of strings representing features to be
-        included in the output (default: None).
-    _kms_key_id (str): An KMS key id. If set, will be used to encrypt the result file
+        included in the output. If not set, all features will be included in the output.
         (default: None).
-    _point_in_time_accurate_join (bool): A boolean representing whether using point in time join
-        or not (default: False).
-    _include_duplicated_records (bool): A boolean representing whether including duplicated
-        records or not (default: False).
-    _include_deleted_records (bool): A boolean representing whether including deleted records or
-        not (default: False).
-    _number_of_recent_records (int): An int that how many records will be returned for each
-        record identifier (default: 1).
-    _number_of_records (int): An int that how many records will be returned (default: None).
-    _write_time_ending_timestamp (datetime.datetime): A datetime that all records' write time in
-        dataset will be before it (default: None).
-    _event_time_starting_timestamp (datetime.datetime): A datetime that all records' event time
-        in dataset will be after it (default: None).
-    _event_time_ending_timestamp (datetime.datetime): A datetime that all records' event time in
-        dataset will be before it (default: None).
+    _kms_key_id (str): A KMS key id. If set, will be used to encrypt the result file
+        (default: None).
+    _point_in_time_accurate_join (bool): A boolean representing if point-in-time join
+        is applied to the resulting dataframe when calling "to_dataframe".
+        When set to True, users can retrieve data using “row-level time travel”
+        according to the event times provided to the DatasetBuilder. This requires that the
+        entity dataframe with event times is submitted as the base in the constructor
+        (default: False).
+    _include_duplicated_records (bool): A boolean representing whether the resulting dataframe
+        when calling "to_dataframe" should include duplicated records (default: False).
+    _include_deleted_records (bool): A boolean representing whether the resulting
+        dataframe when calling "to_dataframe" should include deleted records (default: False).
+    _number_of_recent_records (int): An integer representing how many records will be
+        returned for each record identifier (default: 1).
+    _number_of_records (int): An integer representing the number of records that should be
+        returned in the resulting dataframe when calling "to_dataframe" (default: None).
+    _write_time_ending_timestamp (datetime.datetime): A datetime that represents the latest
+        write time for a record to be included in the resulting dataset. Records with a
+        newer write time will be omitted from the resulting dataset. (default: None).
+    _event_time_starting_timestamp (datetime.datetime): A datetime that represents the earliest
+        event time for a record to be included in the resulting dataset. Records
+        with an older event time will be omitted from the resulting dataset. (default: None).
+    _event_time_ending_timestamp (datetime.datetime): A datetime that represents the latest
+        event time for a record to be included in the resulting dataset. Records
+        with a newer event time will be omitted from the resulting dataset. (default: None).
     _feature_groups_to_be_merged (List[FeatureGroupToBeMerged]): A list of
         FeatureGroupToBeMerged which will be joined to base (default: []).
     _event_time_identifier_feature_type (FeatureTypeEnum): A FeatureTypeEnum representing the

@@ -247,7 +256,7 @@ def with_feature_group(
         return self
 
     def point_in_time_accurate_join(self):
-        """Set join type as point in time accurate join.
+        """Enable point-in-time accurate join.
 
         Returns:
             This DatasetBuilder object.

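To make the point-in-time semantics described in the docstring above concrete, here is a minimal pandas-only sketch of the same idea (it does not use the SageMaker SDK at all): for each base row, keep the most recent feature record whose event time is not later than the base event time, so no future information leaks into the dataset. The column names and values are made up for illustration.

    import pandas as pd

    # Entity (base) rows: identifier plus the event time we want features "as of".
    base = pd.DataFrame(
        {"customer_id": [1, 2], "event_time": pd.to_datetime(["2023-02-10", "2023-02-20"])}
    )

    # Offline-store rows for the joined feature group, possibly several per identifier.
    features = pd.DataFrame(
        {
            "customer_id": [1, 1, 2],
            "event_time": pd.to_datetime(["2023-02-01", "2023-02-15", "2023-02-05"]),
            "balance": [10.0, 12.5, 7.0],
        }
    )

    # Point-in-time join: for each base row, take the latest feature record whose
    # event_time is <= the base event_time (direction="backward" in merge_asof).
    joined = pd.merge_asof(
        base.sort_values("event_time"),
        features.sort_values("event_time"),
        on="event_time",
        by="customer_id",
        direction="backward",
    )
    # The row for customer 1 picks the 2023-02-01 record; customer 2 picks 2023-02-05.
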
src/sagemaker/fw_utils.py (+66 -20)

@@ -148,11 +148,17 @@
 ]
 
 
-TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS = ["1.11", "1.11.0"]
-
+TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1"]
 
 TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = ["torch_distributed"]
-
+TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS = [
+    "1.11",
+    "1.11.0",
+    "1.12",
+    "1.12.0",
+    "1.12.1",
+    "1.13.1",
+]
 
 SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
 
@@ -1055,9 +1061,8 @@ def validate_torch_distributed_distribution(
     Raises:
         ValueError: if
             `py_version` is not python3 or
-            `framework_version` is not in TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS
+            `framework_version` is not compatible with instance types
     """
-
     torch_distributed_enabled = False
     if "torch_distributed" in distribution:
         torch_distributed_enabled = distribution.get("torch_distributed").get("enabled", False)
@@ -1066,30 +1071,36 @@ def validate_torch_distributed_distribution(
         return
 
     err_msg = ""
+
     if not image_uri:
         # ignore framework_version and py_version if image_uri is set
         # in case image_uri is not set, then both are mandatory
-        if framework_version not in TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS:
-            err_msg += (
-                f"Provided framework_version {framework_version} is not supported by"
-                " torch_distributed.\n"
-                "Please specify one of the supported framework versions:"
-                f" {TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS} \n"
-            )
         if "py3" not in py_version:
             err_msg += (
                 f"Provided py_version {py_version} is not supported by torch_distributed.\n"
-                "Please specify py_version>=py3"
+                "Please specify py_version>=py3\n"
             )
 
-    # Check instance compatibility
-    match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)
-    if match:
-        if not match[1].startswith("trn"):
+    # Check instance and framework_version compatibility
+    if _is_gpu_instance(instance_type):
+        if framework_version not in TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS:
+            err_msg += (
+                f"Provided framework_version {framework_version} is not supported by"
+                f" torch_distributed for instance {instance_type}.\n"
+                "Please specify one of the supported framework versions:"
+                f"{TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS} \n"
+            )
+    elif _is_trainium_instance(instance_type):
+        if framework_version not in TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS:
+            err_msg += (
+                f"Provided framework_version {framework_version} is not supported by"
+                f" torch_distributed for instance {instance_type}.\n"
+                "Please specify one of the supported framework versions:"
+                f"{TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS} \n"
+            )
+    else:
         err_msg += (
-            "torch_distributed is currently supported only for trainium instances.\n"
-            " Please refer https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training \n"  # noqa E501 # pylint: disable=c0301
-            "for information regarding distributed training on non-trainium instances"
+            "Currently torch_distributed is supported only for GPU and Trainium instances.\n"
         )
 
     # Check entry point type
@@ -1103,6 +1114,41 @@ def validate_torch_distributed_distribution(
         raise ValueError(err_msg)
 
 
+def _is_gpu_instance(instance_type):
+    """Returns bool indicating whether instance_type supports GPU
+
+    Args:
+        instance_type (str): Name of the instance_type to check against.
+
+    Returns:
+        bool: Whether or not the instance_type supports GPU
+    """
+    if isinstance(instance_type, str):
+        match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)
+        if match:
+            if match[1].startswith("p") or match[1].startswith("g"):
+                return True
+        if instance_type == "local_gpu":
+            return True
+    return False
+
+
+def _is_trainium_instance(instance_type):
+    """Returns bool indicating whether instance_type is a Trainium instance
+
+    Args:
+        instance_type (str): Name of the instance_type to check against.
+
+    Returns:
+        bool: Whether or not the instance_type is a Trainium instance
+    """
+    if isinstance(instance_type, str):
+        match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)
+        if match and match[1].startswith("trn"):
+            return True
+    return False
+
+
 def python_deprecation_warning(framework, latest_supported_version):
     """Placeholder docstring"""
     return PYTHON_2_DEPRECATION_WARNING.format(

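For context, the user-facing effect of the validation above shows up when an estimator is configured with the `torch_distributed` distribution. Below is a minimal sketch, assuming a hypothetical training script name and a placeholder role ARN; the supported framework versions referenced in the comment come from the constants introduced in this diff.

    from sagemaker.pytorch import PyTorch

    # On GPU instances (ml.p*/ml.g*/local_gpu), torch_distributed requires
    # framework_version 1.13.1; on Trainium (ml.trn*) instances, 1.11 through
    # 1.13.1 pass the validation above.
    estimator = PyTorch(
        entry_point="train.py",  # hypothetical training script
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
        framework_version="1.13.1",
        py_version="py39",
        instance_type="ml.p4d.24xlarge",
        instance_count=1,
        distribution={"torch_distributed": {"enabled": True}},
    )
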
src/sagemaker/git_utils.py (+11 -7)

@@ -174,7 +174,7 @@ def _clone_command_for_github_like(git_config, dest_dir):
         CalledProcessError: If failed to clone git repo.
     """
     is_https = git_config["repo"].startswith("https://")
-    is_ssh = git_config["repo"].startswith("git@")
+    is_ssh = git_config["repo"].startswith("git@") or git_config["repo"].startswith("ssh://")
     if not is_https and not is_ssh:
         raise ValueError("Invalid Git url provided.")
     if is_ssh:
@@ -277,12 +277,16 @@ def _run_clone_command(repo_url, dest_dir):
     if repo_url.startswith("https://"):
         my_env["GIT_TERMINAL_PROMPT"] = "0"
         subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
-    elif repo_url.startswith("git@"):
-        with tempfile.NamedTemporaryFile() as sshnoprompt:
-            with open(sshnoprompt.name, "w") as write_pipe:
-                write_pipe.write("ssh -oBatchMode=yes $@")
-            os.chmod(sshnoprompt.name, 0o511)
-            my_env["GIT_SSH"] = sshnoprompt.name
+    elif repo_url.startswith("git@") or repo_url.startswith("ssh://"):
+        try:
+            with tempfile.NamedTemporaryFile() as sshnoprompt:
+                with open(sshnoprompt.name, "w") as write_pipe:
+                    write_pipe.write("ssh -oBatchMode=yes $@")
+                os.chmod(sshnoprompt.name, 0o511)
+                my_env["GIT_SSH"] = sshnoprompt.name
+                subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
+        except subprocess.CalledProcessError:
+            del my_env["GIT_SSH"]
             subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
 

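The clone changes above are exercised through the `git_config` argument accepted by SageMaker estimators. A minimal sketch follows, assuming a placeholder ssh:// repository URL, branch, script path, and role ARN; with SSH URLs, authentication relies on the SSH key available on the machine running the SDK.

    from sagemaker.pytorch import PyTorch

    git_config = {
        "repo": "ssh://git@github.com/example-org/example-repo.git",  # placeholder repository
        "branch": "main",                                             # placeholder branch
    }

    estimator = PyTorch(
        entry_point="train.py",  # path relative to the cloned repository
        source_dir="src",        # hypothetical source directory inside the repo
        git_config=git_config,
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
        framework_version="1.13.1",
        py_version="py39",
        instance_type="ml.m5.xlarge",
        instance_count=1,
    )
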