
Commit 535d39e

Merge pull request #1 from aws/master
rebase master
2 parents: b2e1d26 + 641a947

34 files changed: +926 -83 lines

CHANGELOG.md (+23)

@@ -1,5 +1,28 @@
 # Changelog
 
+## v2.191.0 (2023-10-05)
+
+### Features
+
+* Selective Step Execution milestone 2 features
+* feature-processor extra data sources support
+
+## v2.190.0 (2023-10-04)
+
+### Features
+
+* Add support for in-memory feature groups and collection type features in Feature Store.
+
+### Bug Fixes and Other Changes
+
+* chore: xfail resource in use failure for specific test
+* Add missing API docs for processors
+
+### Documentation Changes
+
+* Bring back (de)serializers documentation
+* Add missing AirFlow operators + link to airflow documentation
+
 ## v2.189.0 (2023-10-03)
 
 ### Features

VERSION (+1 -1)

@@ -1 +1 @@
-2.189.1.dev0
+2.191.1.dev0

doc/api/prep_data/feature_store.rst (-1)

@@ -6,7 +6,6 @@ Feature Group
 
 .. autoclass:: sagemaker.feature_store.feature_group.FeatureGroup
     :members:
-    :exclude-members: load_feature_definitions
     :show-inheritance:
 
 .. autoclass:: sagemaker.feature_store.feature_group.AthenaQuery

src/sagemaker/feature_store/feature_group.py (+2 -2)

@@ -759,8 +759,8 @@ def load_feature_definitions(
         """Load feature definitions from a Pandas DataFrame.
 
         Column name is used as feature name. Feature type is inferred from the dtype
-        of the column. Dtype int_, int8, int16, int32, int64, uint8, uint16, uint32
-        and uint64 are mapped to Integral feature type. Dtype float_, float16, float32
+        of the column. Dtype :literal:`int_`, int8, int16, int32, int64, uint8, uint16, uint32
+        and uint64 are mapped to Integral feature type. Dtype :literal:`float_`, float16, float32
         and float64 are mapped to Fractional feature type. string dtype is mapped to
         String feature type.
 
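The :literal: markup only changes how the docstring renders; the inference rules are unchanged. A minimal sketch of those rules in use, assuming default AWS credentials and a hypothetical feature group name:

import pandas as pd
from sagemaker.feature_store.feature_group import FeatureGroup

df = pd.DataFrame(
    {
        "record_id": pd.Series([1, 2], dtype="int64"),    # int64 -> Integral
        "score": pd.Series([0.5, 0.7], dtype="float64"),  # float64 -> Fractional
        "label": pd.Series(["a", "b"], dtype="string"),   # string -> String
    }
)

# Hypothetical feature group; one feature definition is inferred per column.
feature_group = FeatureGroup(name="my-feature-group")
feature_definitions = feature_group.load_feature_definitions(data_frame=df)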

src/sagemaker/feature_store/feature_processor/__init__.py (+2)

@@ -17,6 +17,8 @@
     CSVDataSource,
     FeatureGroupDataSource,
     ParquetDataSource,
+    BaseDataSource,
+    PySparkDataSource,
 )
 from sagemaker.feature_store.feature_processor._exceptions import (  # noqa: F401
     IngestionError,

src/sagemaker/feature_store/feature_processor/_config_uploader.py (+2 -2)

@@ -31,7 +31,7 @@
     _JobSettings,
     RUNTIME_SCRIPTS_CHANNEL_NAME,
     REMOTE_FUNCTION_WORKSPACE,
-    SPARK_CONF_WORKSPACE,
+    SPARK_CONF_CHANNEL_NAME,
     _prepare_and_upload_spark_dependent_files,
 )
 from sagemaker.remote_function.runtime_environment.runtime_environment_manager import (
@@ -99,7 +99,7 @@ def prepare_step_input_channel_for_spark_mode(
         )
 
     if config_file_s3_uri:
-        input_data_config[SPARK_CONF_WORKSPACE] = TrainingInput(
+        input_data_config[SPARK_CONF_CHANNEL_NAME] = TrainingInput(
             s3_data=config_file_s3_uri,
             s3_data_type="S3Prefix",
             distribution=S3_DATA_DISTRIBUTION_TYPE,

src/sagemaker/feature_store/feature_processor/_data_source.py (+68 -2)

@@ -13,10 +13,76 @@
 """Contains classes to define input data sources."""
 from __future__ import absolute_import
 
-from typing import Optional
+from typing import Optional, Dict, Union, TypeVar, Generic
+from abc import ABC, abstractmethod
+from pyspark.sql import DataFrame, SparkSession
+
 
 import attr
 
+T = TypeVar("T")
+
+
+@attr.s
+class BaseDataSource(Generic[T], ABC):
+    """Abstract base class for feature processor data sources.
+
+    Provides a skeleton for customization requiring the overriding of the method to read data from
+    data source and return the specified type.
+    """
+
+    @abstractmethod
+    def read_data(self, *args, **kwargs) -> T:
+        """Read data from data source and return the specified type.
+
+        Args:
+            args: Arguments for reading the data.
+            kwargs: Keyword argument for reading the data.
+        Returns:
+            T: The specified abstraction of data source.
+        """
+
+    @property
+    @abstractmethod
+    def data_source_unique_id(self) -> str:
+        """The identifier for the customized feature processor data source.
+
+        Returns:
+            str: The data source unique id.
+        """
+
+    @property
+    @abstractmethod
+    def data_source_name(self) -> str:
+        """The name for the customized feature processor data source.
+
+        Returns:
+            str: The data source name.
+        """
+
+
+@attr.s
+class PySparkDataSource(BaseDataSource[DataFrame], ABC):
+    """Abstract base class for feature processor data sources.
+
+    Provides a skeleton for customization requiring the overriding of the method to read data from
+    data source and return the Spark DataFrame.
+    """
+
+    @abstractmethod
+    def read_data(
+        self, spark: SparkSession, params: Optional[Dict[str, Union[str, Dict]]] = None
+    ) -> DataFrame:
+        """Read data from data source and convert the data to Spark DataFrame.
+
+        Args:
+            spark (SparkSession): The Spark session to read the data.
+            params (Optional[Dict[str, Union[str, Dict]]]): Parameters provided to the
+                feature_processor decorator.
+        Returns:
+            DataFrame: The Spark DataFrame as an abstraction on the data source.
+        """
+
 
 @attr.s
 class FeatureGroupDataSource:
@@ -26,7 +92,7 @@ class FeatureGroupDataSource:
         name (str): The name or ARN of the Feature Group.
         input_start_offset (Optional[str], optional): A duration specified as a string in the
             format '<no> <unit>' where 'no' is a number and 'unit' is a unit of time in ['hours',
-            'days', 'weeks', 'months', 'years'] (plural and singluar forms). Inputs contain data
+            'days', 'weeks', 'months', 'years'] (plural and singular forms). Inputs contain data
             with event times no earlier than input_start_offset in the past. Offsets are relative
             to the function execution time. If the function is executed by a Schedule, then the
             offset is relative to the scheduled start time. Defaults to None.
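These base classes, exported via the __init__.py change above, are what the changelog's "extra data sources support" refers to: users subclass them to plug custom sources into the feature processor. A minimal sketch of a concrete subclass, with a hypothetical S3 location:

from typing import Dict, Optional, Union

from pyspark.sql import DataFrame, SparkSession

from sagemaker.feature_store.feature_processor import PySparkDataSource


class S3ParquetDataSource(PySparkDataSource):
    """Hypothetical custom source that reads Parquet files from S3."""

    # Plain class attributes are enough to satisfy the abstract properties.
    data_source_name = "s3-parquet"
    data_source_unique_id = "s3://my-bucket/my-table"  # hypothetical URI

    def read_data(
        self, spark: SparkSession, params: Optional[Dict[str, Union[str, Dict]]] = None
    ) -> DataFrame:
        # Per the base-class contract, return the source as a Spark DataFrame.
        return spark.read.parquet(self.data_source_unique_id)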

src/sagemaker/feature_store/feature_processor/_factory.py (+24 -8)

@@ -13,6 +13,7 @@
 """Contains static factory classes to instantiate complex objects for the FeatureProcessor."""
 from __future__ import absolute_import
 
+from typing import Dict
 from pyspark.sql import DataFrame
 
 from sagemaker.feature_store.feature_processor._enums import FeatureProcessorMode
@@ -41,6 +42,7 @@
     InputValidator,
     SparkUDFSignatureValidator,
     InputOffsetValidator,
+    BaseDataSourceValidator,
     ValidatorChain,
 )
 
@@ -55,6 +57,7 @@ def get_validation_chain(fp_config: FeatureProcessorConfig) -> ValidatorChain:
             InputValidator(),
             FeatureProcessorArgValidator(),
             InputOffsetValidator(),
+            BaseDataSourceValidator(),
         ]
 
         mode = fp_config.mode
@@ -85,14 +88,19 @@ def get_udf_wrapper(fp_config: FeatureProcessorConfig) -> UDFWrapper:
         mode = fp_config.mode
 
         if FeatureProcessorMode.PYSPARK == mode:
-            return UDFWrapperFactory._get_spark_udf_wrapper()
+            return UDFWrapperFactory._get_spark_udf_wrapper(fp_config)
 
         raise ValueError(f"FeatureProcessorMode {mode} is not supported.")
 
     @staticmethod
-    def _get_spark_udf_wrapper() -> UDFWrapper[DataFrame]:
-        """Instantiate a new UDFWrapper for PySpark functions."""
-        spark_session_factory = UDFWrapperFactory._get_spark_session_factory()
+    def _get_spark_udf_wrapper(fp_config: FeatureProcessorConfig) -> UDFWrapper[DataFrame]:
+        """Instantiate a new UDFWrapper for PySpark functions.
+
+        Args:
+            fp_config (FeatureProcessorConfig): the configuration values for the feature_processor
+                decorator.
+        """
+        spark_session_factory = UDFWrapperFactory._get_spark_session_factory(fp_config.spark_config)
         feature_store_manager_factory = UDFWrapperFactory._get_feature_store_manager_factory()
 
         output_manager = UDFWrapperFactory._get_spark_output_receiver(feature_store_manager_factory)
@@ -131,7 +139,7 @@ def _get_spark_output_receiver(
 
         Args:
             feature_store_manager_factory (FeatureStoreManagerFactory): A factory to provide
-                that provides a FeaturStoreManager that handles data ingestion to a Feature Group.
+                that provides a FeatureStoreManager that handles data ingestion to a Feature Group.
                 The factory lazily loads the FeatureStoreManager.
 
         Returns:
@@ -140,10 +148,18 @@ def _get_spark_output_receiver(
         return SparkOutputReceiver(feature_store_manager_factory)
 
     @staticmethod
-    def _get_spark_session_factory() -> SparkSessionFactory:
-        """Instantiate a new SparkSessionFactory"""
+    def _get_spark_session_factory(spark_config: Dict[str, str]) -> SparkSessionFactory:
+        """Instantiate a new SparkSessionFactory
+
+        Args:
+            spark_config (Dict[str, str]): The Spark configuration that will be passed to the
+                initialization of Spark session.
+
+        Returns:
+            SparkSessionFactory: A Spark session factory instance.
+        """
         environment_helper = EnvironmentHelper()
-        return SparkSessionFactory(environment_helper)
+        return SparkSessionFactory(environment_helper, spark_config)
 
     @staticmethod
     def _get_feature_store_manager_factory() -> FeatureStoreManagerFactory:

src/sagemaker/feature_store/feature_processor/_feature_processor_config.py (+10 -2)

@@ -21,6 +21,7 @@
     CSVDataSource,
     FeatureGroupDataSource,
     ParquetDataSource,
+    BaseDataSource,
 )
 from sagemaker.feature_store.feature_processor._enums import FeatureProcessorMode
 
@@ -37,21 +38,27 @@ class FeatureProcessorConfig:
     It only serves as an immutable data class.
     """
 
-    inputs: Sequence[Union[FeatureGroupDataSource, CSVDataSource, ParquetDataSource]] = attr.ib()
+    inputs: Sequence[
+        Union[FeatureGroupDataSource, CSVDataSource, ParquetDataSource, BaseDataSource]
+    ] = attr.ib()
     output: str = attr.ib()
     mode: FeatureProcessorMode = attr.ib()
     target_stores: Optional[List[str]] = attr.ib()
     parameters: Optional[Dict[str, Union[str, Dict]]] = attr.ib()
     enable_ingestion: bool = attr.ib()
+    spark_config: Dict[str, str] = attr.ib()
 
     @staticmethod
     def create(
-        inputs: Sequence[Union[FeatureGroupDataSource, CSVDataSource, ParquetDataSource]],
+        inputs: Sequence[
+            Union[FeatureGroupDataSource, CSVDataSource, ParquetDataSource, BaseDataSource]
+        ],
         output: str,
         mode: FeatureProcessorMode,
         target_stores: Optional[List[str]],
         parameters: Optional[Dict[str, Union[str, Dict]]],
        enable_ingestion: bool,
+        spark_config: Dict[str, str],
     ) -> "FeatureProcessorConfig":
         """Static initializer."""
         return FeatureProcessorConfig(
@@ -61,4 +68,5 @@ def create(
             target_stores=target_stores,
             parameters=parameters,
             enable_ingestion=enable_ingestion,
+            spark_config=spark_config,
         )
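This config class is internal (the feature_processor decorator builds it), but a hedged sketch of the expanded create signature makes the new fields concrete; the output ARN is a placeholder and the input reuses the custom source sketched earlier:

from sagemaker.feature_store.feature_processor._enums import FeatureProcessorMode
from sagemaker.feature_store.feature_processor._feature_processor_config import (
    FeatureProcessorConfig,
)

fp_config = FeatureProcessorConfig.create(
    inputs=[S3ParquetDataSource()],  # custom BaseDataSource subclass from above
    output="arn:aws:sagemaker:us-west-2:000000000000:feature-group/my-output",  # placeholder
    mode=FeatureProcessorMode.PYSPARK,
    target_stores=None,
    parameters=None,
    enable_ingestion=True,
    spark_config={"spark.executor.memory": "4g"},  # new field in this commit
)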

src/sagemaker/feature_store/feature_processor/_params_loader.py (+1 -1)

@@ -72,7 +72,7 @@ def get_parameter_args(
                 feature_processor decorator.
 
         Returns:
-            Dict[str, Union[str, Dict]]: A dictionary containin both user provided
+            Dict[str, Union[str, Dict]]: A dictionary that contains both user provided
                 parameters (feature_processor argument) and system parameters.
         """
         return {

src/sagemaker/feature_store/feature_processor/_spark_factory.py (+25 -12)

@@ -14,7 +14,7 @@
 from __future__ import absolute_import
 
 from functools import lru_cache
-from typing import List, Tuple
+from typing import List, Tuple, Dict
 
 import feature_store_pyspark
 import feature_store_pyspark.FeatureStoreManager as fsm
@@ -34,14 +34,19 @@ class SparkSessionFactory:
     instance throughout the application.
     """
 
-    def __init__(self, environment_helper: EnvironmentHelper) -> None:
+    def __init__(
+        self, environment_helper: EnvironmentHelper, spark_config: Dict[str, str] = None
+    ) -> None:
         """Initialize the SparkSessionFactory.
 
         Args:
             environment_helper (EnvironmentHelper): A helper class to determine the current
                 execution.
+            spark_config (Dict[str, str]): The Spark configuration that will be passed to the
+                initialization of Spark session.
         """
         self.environment_helper = environment_helper
+        self.spark_config = spark_config
 
     @property
     @lru_cache()
@@ -106,24 +111,32 @@ def _get_spark_configs(self, is_training_job) -> List[Tuple[str, str]]:
             ("spark.port.maxRetries", "50"),
         ]
 
+        if self.spark_config:
+            spark_configs.extend(self.spark_config.items())
+
         if not is_training_job:
+            fp_spark_jars = feature_store_pyspark.classpath_jars()
+            fp_spark_packages = [
+                "org.apache.hadoop:hadoop-aws:3.3.1",
+                "org.apache.hadoop:hadoop-common:3.3.1",
+            ]
+
+            if self.spark_config and "spark.jars" in self.spark_config:
+                fp_spark_jars.append(self.spark_config.get("spark.jars"))
+
+            if self.spark_config and "spark.jars.packages" in self.spark_config:
+                fp_spark_packages.append(self.spark_config.get("spark.jars.packages"))
+
             spark_configs.extend(
                 (
-                    (
-                        "spark.jars",
-                        ",".join(feature_store_pyspark.classpath_jars()),
-                    ),
+                    ("spark.jars", ",".join(fp_spark_jars)),
                     (
                         "spark.jars.packages",
-                        ",".join(
-                            [
-                                "org.apache.hadoop:hadoop-aws:3.3.1",
-                                "org.apache.hadoop:hadoop-common:3.3.1",
-                            ]
-                        ),
+                        ",".join(fp_spark_packages),
                     ),
                 )
             )
+
         return spark_configs
 
     def _get_jsc_hadoop_configs(self) -> List[Tuple[str, str]]:
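The net effect of the new branch: user-supplied spark.jars and spark.jars.packages are appended to the factory defaults rather than replacing them. A standalone illustration of that merge, using stand-in values and no SageMaker imports:

# Stand-ins for feature_store_pyspark.classpath_jars() and the pinned packages.
fp_spark_jars = ["feature-store-connector.jar"]
fp_spark_packages = [
    "org.apache.hadoop:hadoop-aws:3.3.1",
    "org.apache.hadoop:hadoop-common:3.3.1",
]

spark_config = {"spark.jars": "s3://my-bucket/extra.jar"}  # hypothetical user config

# Mirrors the appended branch above: user values extend, never overwrite.
if spark_config and "spark.jars" in spark_config:
    fp_spark_jars.append(spark_config.get("spark.jars"))

print(",".join(fp_spark_jars))
# feature-store-connector.jar,s3://my-bucket/extra.jar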
