@@ -171,6 +171,7 @@ class IngestionManagerPandas:
171
171
feature_group_name (str): name of the Feature Group.
172
172
sagemaker_fs_runtime_client_config (Config): instance of the Config class
173
173
for boto calls.
174
+ sagemaker_session (Session): session instance to perform boto calls.
174
175
data_frame (DataFrame): pandas DataFrame to be ingested to the given feature group.
175
176
max_workers (int): number of threads to create.
176
177
max_processes (int): number of processes to create. Each process spawns
@@ -180,7 +181,8 @@ class IngestionManagerPandas:
180
181
"""
181
182
182
183
feature_group_name : str = attr .ib ()
183
- sagemaker_fs_runtime_client_config : Config = attr .ib ()
184
+ sagemaker_fs_runtime_client_config : Config = attr .ib (default = None )
185
+ sagemaker_session : Session = attr .ib (default = None )
184
186
max_workers : int = attr .ib (default = 1 )
185
187
max_processes : int = attr .ib (default = 1 )
186
188
profile_name : str = attr .ib (default = None )
@@ -216,29 +218,20 @@ def _ingest_single_batch(
216
218
if "max_attempts" not in retry_config and "total_max_attempts" not in retry_config :
217
219
client_config = copy .deepcopy (client_config )
218
220
client_config .retries = {"max_attempts" : 10 , "mode" : "standard" }
219
- sagemaker_featurestore_runtime_client = boto3 .Session (profile_name = profile_name ).client (
221
+ sagemaker_fs_runtime_client = boto3 .Session (profile_name = profile_name ).client (
220
222
service_name = "sagemaker-featurestore-runtime" , config = client_config
221
223
)
222
224
223
225
logger .info ("Started ingesting index %d to %d" , start_index , end_index )
224
226
failed_rows = list ()
225
227
for row in data_frame [start_index :end_index ].itertuples ():
226
- record = [
227
- FeatureValue (
228
- feature_name = data_frame .columns [index - 1 ],
229
- value_as_string = str (row [index ]),
230
- )
231
- for index in range (1 , len (row ))
232
- if pd .notna (row [index ])
233
- ]
234
- try :
235
- sagemaker_featurestore_runtime_client .put_record (
236
- FeatureGroupName = feature_group_name ,
237
- Record = [value .to_dict () for value in record ],
238
- )
239
- except Exception as e : # pylint: disable=broad-except
240
- logger .error ("Failed to ingest row %d: %s" , row [0 ], e )
241
- failed_rows .append (row [0 ])
228
+ IngestionManagerPandas ._ingest_row (
229
+ data_frame = data_frame ,
230
+ row = row ,
231
+ feature_group_name = feature_group_name ,
232
+ sagemaker_fs_runtime_client = sagemaker_fs_runtime_client ,
233
+ failed_rows = failed_rows ,
234
+ )
242
235
return failed_rows
243
236
244
237
@property
@@ -280,6 +273,69 @@ def wait(self, timeout=None):
280
273
f"Failed to ingest some data into FeatureGroup { self .feature_group_name } " ,
281
274
)
282
275
276
@staticmethod
def _ingest_row(
    data_frame: DataFrame,
    row: tuple,
    feature_group_name: str,
    sagemaker_fs_runtime_client,
    failed_rows: List[int],
) -> None:
    """Ingest a single DataFrame row into FeatureStore.

    Values for which ``pd.notna`` is false are left out of the record. A
    failing ``put_record`` call is logged and its row index appended to
    ``failed_rows``; the exception is not propagated, so the caller can keep
    ingesting the remaining rows.

    Args:
        data_frame (DataFrame): source DataFrame; its columns supply the
            feature names.
        row (tuple): row as yielded by ``DataFrame.itertuples()``: the index
            label at position 0, followed by one value per column.
        feature_group_name (str): name of the Feature Group.
        sagemaker_fs_runtime_client: client whose ``put_record`` method is
            used to write the record.
        failed_rows (List[int]): list, mutated in place, collecting the index
            of every row for which ingestion failed.

    Returns:
        None. Failures are reported through ``failed_rows``.
    """
    # row[0] is the index label, so the column values start at position 1
    # and line up one-to-one with data_frame.columns.
    record = [
        FeatureValue(feature_name=column, value_as_string=str(value))
        for column, value in zip(data_frame.columns, row[1:])
        if pd.notna(value)
    ]
    try:
        sagemaker_fs_runtime_client.put_record(
            FeatureGroupName=feature_group_name,
            Record=[value.to_dict() for value in record],
        )
    except Exception as e:  # pylint: disable=broad-except
        # %s rather than %d: index labels are not necessarily integers, and a
        # %d placeholder would make the log record itself fail to format.
        logger.error("Failed to ingest row %s: %s", row[0], e)
        failed_rows.append(row[0])
313
+
314
def _run_single_process_single_thread(self, data_frame: DataFrame):
    """Ingest a DataFrame utilizing a single process and a single thread.

    Every row is written sequentially with the FeatureStore runtime client
    taken from ``self.sagemaker_session``. The indices of rows that failed
    are stored in ``self._failed_indices``.

    Args:
        data_frame (DataFrame): source DataFrame to be ingested.

    Raises:
        ValueError: if no ``sagemaker_session`` was provided.
        IngestionError: if ingestion failed for at least one row; carries
            the failed indices.
    """
    # sagemaker_session defaults to None; fail with a clear message instead
    # of an AttributeError raised from the attribute lookup below.
    if self.sagemaker_session is None:
        raise ValueError(
            "sagemaker_session is required to ingest with a single process and single thread"
        )

    # The whole frame is ingested here, so the logged range is 0..len.
    logger.info("Started ingesting index %d to %d", 0, len(data_frame))
    failed_rows = []
    sagemaker_fs_runtime_client = self.sagemaker_session.sagemaker_featurestore_runtime_client
    for row in data_frame.itertuples():
        IngestionManagerPandas._ingest_row(
            data_frame=data_frame,
            row=row,
            feature_group_name=self.feature_group_name,
            sagemaker_fs_runtime_client=sagemaker_fs_runtime_client,
            failed_rows=failed_rows,
        )
    self._failed_indices = failed_rows

    if self._failed_indices:
        raise IngestionError(
            self._failed_indices,
            f"Failed to ingest some data into FeatureGroup {self.feature_group_name}",
        )
338
+
283
339
def _run_multi_process (self , data_frame : DataFrame , wait = True , timeout = None ):
284
340
"""Start the ingestion process with the specified number of processes.
285
341
@@ -391,7 +447,10 @@ def run(self, data_frame: DataFrame, wait=True, timeout=None):
391
447
timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
392
448
if timeout is reached.
393
449
"""
394
- self ._run_multi_process (data_frame = data_frame , wait = wait , timeout = timeout )
450
+ if self .max_workers == 1 and self .max_processes == 1 and self .profile_name is None :
451
+ self ._run_single_process_single_thread (data_frame = data_frame )
452
+ else :
453
+ self ._run_multi_process (data_frame = data_frame , wait = wait , timeout = timeout )
395
454
396
455
397
456
class IngestionError (Exception ):
@@ -755,6 +814,7 @@ def ingest(
755
814
756
815
manager = IngestionManagerPandas (
757
816
feature_group_name = self .name ,
817
+ sagemaker_session = self .sagemaker_session ,
758
818
sagemaker_fs_runtime_client_config = self .sagemaker_session .sagemaker_featurestore_runtime_client .meta .config ,
759
819
max_workers = max_workers ,
760
820
max_processes = max_processes ,
0 commit comments