@@ -160,6 +160,7 @@ class IngestionManagerPandas:
160
160
data_frame : DataFrame = attr .ib ()
161
161
max_workers : int = attr .ib (default = 1 )
162
162
_futures : Dict [Any , Any ] = attr .ib (init = False , factory = dict )
163
+ _failed_indices : List [int ] = attr .ib (factory = list )
163
164
164
165
@staticmethod
165
166
def _ingest_single_batch (
@@ -168,7 +169,7 @@ def _ingest_single_batch(
168
169
sagemaker_session : Session ,
169
170
start_index : int ,
170
171
end_index : int ,
171
- ):
172
+ ) -> List [ int ] :
172
173
"""Ingest a single batch of DataFrame rows into FeatureStore.
173
174
174
175
Args:
@@ -177,19 +178,38 @@ def _ingest_single_batch(
177
178
sagemaker_session (Session): session instance to perform boto calls.
178
179
start_index (int): starting position to ingest in this batch.
179
180
end_index (int): ending position to ingest in this batch.
181
+
182
+ Returns:
183
+ List of row indices that failed to be ingested.
180
184
"""
181
185
logger .info ("Started ingesting index %d to %d" , start_index , end_index )
182
- for row in data_frame [start_index :end_index ].itertuples (index = False ):
186
+ failed_rows = list ()
187
+ for row in data_frame [start_index :end_index ].itertuples ():
183
188
record = [
184
189
FeatureValue (
185
- feature_name = data_frame .columns [index ], value_as_string = str (row [index ])
190
+ feature_name = data_frame .columns [index - 1 ], value_as_string = str (row [index ])
186
191
)
187
- for index in range (len (row ))
192
+ for index in range (1 , len (row ))
188
193
if pd .notna (row [index ])
189
194
]
190
- sagemaker_session .put_record (
191
- feature_group_name = feature_group_name , record = [value .to_dict () for value in record ]
192
- )
195
+ try :
196
+ sagemaker_session .put_record (
197
+ feature_group_name = feature_group_name ,
198
+ record = [value .to_dict () for value in record ],
199
+ )
200
+ except Exception as e : # pylint: disable=broad-except
201
+ logger .error ("Failed to ingest row %d: %s" , row [0 ], e )
202
+ failed_rows .append (row [0 ])
203
+ return failed_rows
204
+
205
+ @property
206
+ def failed_rows (self ) -> List [int ]:
207
+ """Get rows that failed to ingest
208
+
209
+ Returns:
210
+ List of row indices that failed to be ingested.
211
+ """
212
+ return self ._failed_indices
193
213
194
214
def wait (self , timeout = None ):
195
215
"""Wait for the ingestion process to finish.
@@ -198,18 +218,17 @@ def wait(self, timeout=None):
198
218
timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
199
219
if timeout is reached.
200
220
"""
201
- failed = False
221
+ self . _failed_indices = list ()
202
222
for future in as_completed (self ._futures , timeout = timeout ):
203
223
start , end = self ._futures [future ]
204
- try :
205
- future .result ()
206
- except Exception as e : # pylint: disable=broad-except
207
- failed = True
208
- logger .error ("Failed to ingest row %d to %d: %s" , start , end , e )
224
+ result = future .result ()
225
+ if result :
226
+ logger .error ("Failed to ingest row %d to %d" , start , end )
209
227
else :
210
228
logger .info ("Successfully ingested row %d to %d" , start , end )
229
+ self ._failed_indices += result
211
230
212
- if failed :
231
+ if len ( self . _failed_indices ) > 0 :
213
232
raise RuntimeError (
214
233
f"Failed to ingest some data into FeatureGroup { self .feature_group_name } "
215
234
)
0 commit comments