13
13
"""Contains classes to define input data sources."""
14
14
from __future__ import absolute_import
15
15
16
- from typing import Optional
16
+ from typing import Optional , Dict , Union , TypeVar , Generic
17
+ from abc import ABC , abstractmethod
18
+ from pyspark .sql import DataFrame , SparkSession
19
+
17
20
18
21
import attr
19
22
23
T = TypeVar("T")


@attr.s
class BaseDataSource(Generic[T], ABC):
    """Abstract base class for feature processor data sources.

    Subclasses customize data loading by overriding ``read_data`` to pull
    from their source and return it as the parameterized type ``T``.
    """

    @abstractmethod
    def read_data(self, *args, **kwargs) -> T:
        """Read data from the data source and return the specified type.

        Args:
            args: Positional arguments for reading the data.
            kwargs: Keyword arguments for reading the data.

        Returns:
            T: The specified abstraction of the data source.
        """

    @property
    @abstractmethod
    def data_source_unique_id(self) -> str:
        """The identifier for the customized feature processor data source.

        Returns:
            str: The data source unique id.
        """

    @property
    @abstractmethod
    def data_source_name(self) -> str:
        """The name for the customized feature processor data source.

        Returns:
            str: The data source name.
        """
62
+
63
+
64
@attr.s
class PySparkDataSource(BaseDataSource[DataFrame], ABC):
    """Abstract base class for PySpark feature processor data sources.

    Subclasses customize data loading by overriding ``read_data`` to pull
    from their source and return it as a Spark DataFrame.
    """

    @abstractmethod
    def read_data(
        self, spark: SparkSession, params: Optional[Dict[str, Union[str, Dict]]] = None
    ) -> DataFrame:
        """Read data from the data source and convert the data to a Spark DataFrame.

        Args:
            spark (SparkSession): The Spark session to read the data.
            params (Optional[Dict[str, Union[str, Dict]]]): Parameters provided to the
                feature_processor decorator.

        Returns:
            DataFrame: The Spark DataFrame as an abstraction on the data source.
        """
85
+
20
86
21
87
@attr .s
22
88
class FeatureGroupDataSource :
@@ -26,7 +92,7 @@ class FeatureGroupDataSource:
26
92
name (str): The name or ARN of the Feature Group.
27
93
input_start_offset (Optional[str], optional): A duration specified as a string in the
28
94
format '<no> <unit>' where 'no' is a number and 'unit' is a unit of time in ['hours',
29
- 'days', 'weeks', 'months', 'years'] (plural and singluar forms). Inputs contain data
95
+ 'days', 'weeks', 'months', 'years'] (plural and singular forms). Inputs contain data
30
96
with event times no earlier than input_start_offset in the past. Offsets are relative
31
97
to the function execution time. If the function is executed by a Schedule, then the
32
98
offset is relative to the scheduled start time. Defaults to None.
0 commit comments