Skip to content

Commit d2ff02a

Browse files
committed
doc: added more doc and examples
1 parent badb9bd commit d2ff02a

File tree

2 files changed

+37
-16
lines changed

2 files changed

+37
-16
lines changed

src/sagemaker/feature_store/feature_group.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,8 @@ def as_dataframe(self, **kwargs) -> DataFrame:
133133
"""Download the result of the current query and load it into a DataFrame.
134134
135135
Args:
136-
kwargs: key arguments used for the method pandas.read_csv to be able to have
137-
a better tuning on data.
138-
For more info read
136+
**kwargs (object): keyword arguments passed to pandas.read_csv to fine-tune how
137+
the data is loaded. For more info read:
139138
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
140139
141140
Returns:

src/sagemaker/feature_store/feature_utils.py

+35-13
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,24 @@ def get_feature_group_as_dataframe(
8888
event_time_feature_name: str = None,
8989
latest_ingestion: bool = True,
9090
verbose: bool = True,
91-
**pandas_read_csv_kwargs,
91+
**kwargs,
9292
) -> DataFrame:
9393
"""Get a :class:`sagemaker.feature_store.feature_group.FeatureGroup` as a pandas.DataFrame
9494
95+
Examples:
96+
>>> from sagemaker.feature_store.feature_utils import get_feature_group_as_dataframe
97+
>>>
98+
>>> region = "eu-west-1"
99+
>>> fg_data = get_feature_group_as_dataframe(feature_group_name="feature_group",
100+
...     athena_bucket="s3://bucket/athena_queries",
101+
...     region=region,
102+
...     event_time_feature_name="EventTimeId"
103+
... )
104+
>>>
105+
>>> type(fg_data)
106+
<class 'pandas.core.frame.DataFrame'>
107+
>>>
108+
95109
Description:
96110
Method to run an athena query over a Feature Group in a Feature Store
97111
to retrieve its data. It needs the sagemaker.Session linked to a role
@@ -106,17 +120,22 @@ def get_feature_group_as_dataframe(
106120
in the feature group that wasn't deleted. It needs to use the keyword
107121
"#{table}" to refer to the FeatureGroup name. e.g.:
108122
'SELECT * FROM "sagemaker_featurestore"."#{table}"'
123+
It must not end with ';'.
109124
athena_bucket (str): Amazon S3 bucket for running the query
110-
role (str): role of the account used to extract data from feature store
111-
session (str): :class:`sagemaker.session.Session`
112-
of SageMaker used to work with the feature store
125+
role (str): role to be assumed to extract data from feature store. If not specified
126+
the default sagemaker execution role will be used.
127+
session (str): :class:`sagemaker.session.Session`
128+
of SageMaker used to work with the feature store. Optional, with
129+
role and region parameters it will infer the session.
113130
event_time_feature_name (str): eventTimeId feature. Mandatory only if the
114-
latest ingestion is True
131+
latest ingestion is True.
115132
latest_ingestion (bool): if True it will get the data only from the latest ingestion.
116133
If False it will take whatever is specified in the query, or
117134
if not specified, it will get all the data that wasn't deleted.
118135
verbose (bool): if True show messages, if False is silent.
119-
136+
**kwargs (object): keyword arguments passed to pandas.read_csv to fine-tune how
137+
the data is loaded. For more info read:
138+
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
120139
Returns:
121140
dataset (pandas.DataFrame): dataset with the data retrieved from feature group
122141
"""
@@ -139,12 +158,13 @@ def get_feature_group_as_dataframe(
139158
)
140159
logger.exception(exc)
141160
raise exc
161+
142162
query += ";"
143163

144164
if session is not None:
145165
sagemaker_session = session
146-
elif role is not None and region is not None:
147-
sagemaker_session = get_session_from_role(region=region)
166+
elif region is not None:
167+
sagemaker_session = get_session_from_role(region=region, assume_role=role)
148168
else:
149169
exc = Exception("Argument Session or role and region must be specified.")
150170
logger.exception(exc)
@@ -166,7 +186,7 @@ def get_feature_group_as_dataframe(
166186
sample_query.wait()
167187

168188
# run Athena query. The output is loaded to a Pandas dataframe.
169-
dataset = sample_query.as_dataframe(**pandas_read_csv_kwargs)
189+
dataset = sample_query.as_dataframe(**kwargs)
170190

171191
msg = f"Data shape retrieve from {feature_group_name}: {dataset.shape}"
172192
logger.info(msg)
@@ -217,7 +237,7 @@ def prepare_fg_from_dataframe_or_file(
217237
record_id: str = "record_id",
218238
event_id: str = "data_as_of_date",
219239
verbose: bool = False,
220-
**pandas_read_csv_kwargs,
240+
**kwargs,
221241
) -> FeatureGroup:
222242
"""Prepares a dataframe to create a :class:`sagemaker.feature_store.feature_group.FeatureGroup`
223243
@@ -229,7 +249,9 @@ def prepare_fg_from_dataframe_or_file(
229249
by default with the names 'record_id' and 'data_as_of_date'.
230250
231251
Args:
232-
**pandas_read_csv_kwargs (object):
252+
**kwargs (object): keyword arguments passed to pandas.read_csv to fine-tune how
253+
the data is loaded. For more info read:
254+
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
233255
feature_group_name (str): feature group name
234256
dataframe_or_path (str, Path, pandas.DataFrame) : pandas.DataFrame or path to the data
235257
verbose (bool) : True for displaying messages, False for silent method.
@@ -256,8 +278,8 @@ def prepare_fg_from_dataframe_or_file(
256278
if isinstance(dataframe_or_path, DataFrame):
257279
data = dataframe_or_path
258280
elif isinstance(dataframe_or_path, str):
259-
pandas_read_csv_kwargs.pop("filepath_or_buffer", None)
260-
data = read_csv(filepath_or_buffer=dataframe_or_path, **pandas_read_csv_kwargs)
281+
kwargs.pop("filepath_or_buffer", None)
282+
data = read_csv(filepath_or_buffer=dataframe_or_path, **kwargs)
261283
else:
262284
exc = Exception(
263285
str(

0 commit comments

Comments
 (0)