@@ -168,7 +168,7 @@ def _ingest_single_batch(
         sagemaker_session: Session,
         start_index: int,
         end_index: int,
-    ):
+    ) -> List[int]:
         """Ingest a single batch of DataFrame rows into FeatureStore.
 
         Args:
@@ -177,50 +177,62 @@ def _ingest_single_batch(
             sagemaker_session (Session): session instance to perform boto calls.
             start_index (int): starting position to ingest in this batch.
             end_index (int): ending position to ingest in this batch.
+
+        Returns:
+            List of row indices that failed to be ingested.
         """
         logger.info("Started ingesting index %d to %d", start_index, end_index)
-        for row in data_frame[start_index:end_index].itertuples(index=False):
+        failed_rows = list()
+        for row in data_frame[start_index:end_index].itertuples():
             record = [
                 FeatureValue(
-                    feature_name=data_frame.columns[index], value_as_string=str(row[index])
+                    feature_name=data_frame.columns[index - 1], value_as_string=str(row[index])
                 )
-                for index in range(len(row))
+                for index in range(1, len(row))
                 if pd.notna(row[index])
             ]
-            sagemaker_session.put_record(
-                feature_group_name=feature_group_name, record=[value.to_dict() for value in record]
-            )
+            try:
+                sagemaker_session.put_record(
+                    feature_group_name=feature_group_name,
+                    record=[value.to_dict() for value in record],
+                )
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error("Failed to ingest row %d: %s", row[0], e)
+                failed_rows.append(row[0])
+        return failed_rows
 
-    def wait(self, timeout=None):
+    def wait(self, timeout=None) -> List[int]:
         """Wait for the ingestion process to finish.
 
         Args:
             timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                 if timeout is reached.
+
+        Returns:
+            List of row indices that failed to be ingested.
         """
-        failed = False
+        failed = []
         for future in as_completed(self._futures, timeout=timeout):
             start, end = self._futures[future]
-            try:
-                future.result()
-            except Exception as e:  # pylint: disable=broad-except
-                failed = True
-                logger.error("Failed to ingest row %d to %d: %s", start, end, e)
+            result = future.result()
+            if result:
+                logger.error("Failed to ingest row %d to %d", start, end)
             else:
                 logger.info("Successfully ingested row %d to %d", start, end)
+            failed += result
 
-        if failed:
-            raise RuntimeError(
-                f"Failed to ingest some data into FeatureGroup {self.feature_group_name}"
-            )
+        return failed
 
-    def run(self, wait=True, timeout=None):
+    def run(self, wait=True, timeout=None) -> List[int]:
         """Start the ingestion process.
 
         Args:
             wait (bool): whether to wait for the ingestion to finish or not.
             timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                 if timeout is reached.
+
+        Returns:
+            List of row indices that failed to be ingested.
         """
         executor = ThreadPoolExecutor(max_workers=self.max_workers)
         batch_size = math.ceil(self.data_frame.shape[0] / self.max_workers)
@@ -241,9 +253,11 @@ def run(self, wait=True, timeout=None):
             ] = (start_index, end_index)
 
         self._futures = futures
+        failed = []
         if wait:
-            self.wait(timeout=timeout)
+            failed = self.wait(timeout=timeout)
         executor.shutdown(wait=False)
+        return failed
 
 
 @attr.s
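
For context on how the new contract might be consumed: with this change, run() and wait() return the DataFrame index labels (the row[0] values from itertuples()) of rows whose put_record call raised, instead of raising a RuntimeError for the whole FeatureGroup. The sketch below is illustrative only; the helper name retry_failed_rows, the retry policy, and the assumption that manager is a non-frozen IngestionManagerPandas built elsewhere are not part of this diff.

from typing import List


def retry_failed_rows(manager, max_attempts: int = 3) -> List[int]:
    """Illustrative sketch: re-run ingestion for rows reported as failed.

    Assumes `manager` is an IngestionManagerPandas created elsewhere and that
    its `data_frame` attribute (referenced in this diff) can be reassigned.
    """
    failed = manager.run(wait=True)
    attempt = 1
    while failed and attempt < max_attempts:
        # Failures are reported by DataFrame index label, so .loc selects
        # exactly those rows for the next attempt.
        manager.data_frame = manager.data_frame.loc[failed]
        failed = manager.run(wait=True)
        attempt += 1
    return failed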