@@ -88,10 +88,24 @@ def get_feature_group_as_dataframe(
88
88
event_time_feature_name : str = None ,
89
89
latest_ingestion : bool = True ,
90
90
verbose : bool = True ,
91
- ** pandas_read_csv_kwargs ,
91
+ ** kwargs ,
92
92
) -> DataFrame :
93
93
"""Get a :class:`sagemaker.feature_store.feature_group.FeatureGroup` as a pandas.DataFrame
94
94
95
+ Examples:
96
+ >>> from sagemaker.feature_store.feature_utils import get_feature_group_as_dataframe
97
+ >>>
98
+ >>> region = "eu-west-1"
99
+ >>> fg_data = get_feature_group_as_dataframe(feature_group_name="feature_group",
100
+ >>> athena_bucket="s3://bucket/athena_queries",
101
+ >>> region=region,
102
+ >>> event_time_feature_name="EventTimeId"
103
+ >>> )
104
+ >>>
105
+ >>> type(fg_data)
106
+ <class 'pandas.core.frame.DataFrame'>
107
+ >>>
108
+
95
109
Description:
96
110
Method to run an athena query over a Feature Group in a Feature Store
97
111
to retrieve its data. It needs the sagemaker.Session linked to a role
@@ -106,17 +120,22 @@ def get_feature_group_as_dataframe(
106
120
in the feature group that wasn't deleted. It needs to use the keyword
107
121
"#{table}" to refer to the FeatureGroup name. e.g.:
108
122
'SELECT * FROM "sagemaker_featurestore"."#{table}"'
123
+ It must not end with ';'.
109
124
athena_bucket (str): Amazon S3 bucket for running the query
110
- role (str): role of the account used to extract data from feature store
111
- session (str): :class:`sagemaker.session.Session`
112
- of SageMaker used to work with the feature store
125
+ role (str): role to be assumed to extract data from feature store. If not specified
126
+ the default sagemaker execution role will be used.
127
+ session (str): :class:`sagemaker.session.Session`
128
+ of SageMaker used to work with the feature store. Optional, with
129
+ role and region parameters it will infer the session.
113
130
event_time_feature_name (str): eventTimeId feature. Mandatory only if the
114
- latest ingestion is True
131
+ latest ingestion is True.
115
132
latest_ingestion (bool): if True it will get the data only from the latest ingestion.
116
133
If False it will take whatever is specified in the query, or
117
134
if not specified, it will get all the data that wasn't deleted.
118
135
verbose (bool): if True show messages, if False is silent.
119
-
136
+ **kwargs (object): keyword arguments used for the method pandas.read_csv to be able to
137
+ have a better tuning on data. For more info read:
138
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
120
139
Returns:
121
140
dataset (pandas.DataFrame): dataset with the data retrieved from feature group
122
141
"""
@@ -139,12 +158,13 @@ def get_feature_group_as_dataframe(
139
158
)
140
159
logger .exception (exc )
141
160
raise exc
161
+
142
162
query += ";"
143
163
144
164
if session is not None :
145
165
sagemaker_session = session
146
- elif role is not None and region is not None :
147
- sagemaker_session = get_session_from_role (region = region )
166
+ elif region is not None :
167
+ sagemaker_session = get_session_from_role (region = region , assume_role = role )
148
168
else :
149
169
exc = Exception ("Argument Session or role and region must be specified." )
150
170
logger .exception (exc )
@@ -166,7 +186,7 @@ def get_feature_group_as_dataframe(
166
186
sample_query .wait ()
167
187
168
188
# run Athena query. The output is loaded to a Pandas dataframe.
169
- dataset = sample_query .as_dataframe (** pandas_read_csv_kwargs )
189
+ dataset = sample_query .as_dataframe (** kwargs )
170
190
171
191
msg = f"Data shape retrieve from { feature_group_name } : { dataset .shape } "
172
192
logger .info (msg )
@@ -217,7 +237,7 @@ def prepare_fg_from_dataframe_or_file(
217
237
record_id : str = "record_id" ,
218
238
event_id : str = "data_as_of_date" ,
219
239
verbose : bool = False ,
220
- ** pandas_read_csv_kwargs ,
240
+ ** kwargs ,
221
241
) -> FeatureGroup :
222
242
"""Prepares a dataframe to create a :class:`sagemaker.feature_store.feature_group.FeatureGroup`
223
243
@@ -229,7 +249,9 @@ def prepare_fg_from_dataframe_or_file(
229
249
by default with the names 'record_id' and 'data_as_of_date'.
230
250
231
251
Args:
232
- **pandas_read_csv_kwargs (object):
252
+ **kwargs (object): keyword arguments used for the method pandas.read_csv to be able to
253
+ have a better tuning on data. For more info read:
254
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
233
255
feature_group_name (str): feature group name
234
256
dataframe_or_path (str, Path, pandas.DataFrame) : pandas.DataFrame or path to the data
235
257
verbose (bool) : True for displaying messages, False for silent method.
@@ -256,8 +278,8 @@ def prepare_fg_from_dataframe_or_file(
256
278
if isinstance (dataframe_or_path , DataFrame ):
257
279
data = dataframe_or_path
258
280
elif isinstance (dataframe_or_path , str ):
259
- pandas_read_csv_kwargs .pop ("filepath_or_buffer" , None )
260
- data = read_csv (filepath_or_buffer = dataframe_or_path , ** pandas_read_csv_kwargs )
281
+ kwargs .pop ("filepath_or_buffer" , None )
282
+ data = read_csv (filepath_or_buffer = dataframe_or_path , ** kwargs )
261
283
else :
262
284
exc = Exception (
263
285
str (
0 commit comments