Skip to content

Commit a0141ed

Browse files
feat(batch): add flag in SqsFifoProcessor to enable continuous message processing (#3954)
* Adding a flag to SqsFifoProcessor to allow message processing to continue * Adding docstring * Refactoring logic to skip execution when messages are part of a groupid with failed messages * Reducing complexity * Adding documentation * Addressing Mathieu's feedback * Addressing Mathieu's feedback * Addressing Ruben's feedback * chore: refactor * chore: refactor * Adding temp test to help Ruben test it * fix: condition * chore: moved exceptions --------- Co-authored-by: Ruben Fonseca <[email protected]>
1 parent 2f2ee62 commit a0141ed

File tree

5 files changed

+280
-44
lines changed

5 files changed

+280
-44
lines changed

aws_lambda_powertools/utilities/batch/exceptions.py

+16
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,19 @@ def __init__(self, msg="", child_exceptions: List[ExceptionInfo] | None = None):
3636
def __str__(self):
3737
parent_exception_str = super(BatchProcessingError, self).__str__()
3838
return self.format_exceptions(parent_exception_str)
39+
40+
41+
class SQSFifoCircuitBreakerError(Exception):
    """Signals a record not processed due to the SQS FIFO processing being interrupted"""


class SQSFifoMessageGroupCircuitBreakerError(Exception):
    """Signals a record not processed due to the SQS FIFO message group processing being interrupted"""

aws_lambda_powertools/utilities/batch/sqs_fifo_partial_processor.py

+55-33
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
from typing import List, Optional, Tuple
2-
3-
from aws_lambda_powertools.utilities.batch import BatchProcessor, EventType
1+
import logging
2+
from typing import Optional, Set
3+
4+
from aws_lambda_powertools.utilities.batch import BatchProcessor, EventType, ExceptionInfo, FailureResponse
5+
from aws_lambda_powertools.utilities.batch.exceptions import (
6+
SQSFifoCircuitBreakerError,
7+
SQSFifoMessageGroupCircuitBreakerError,
8+
)
49
from aws_lambda_powertools.utilities.batch.types import BatchSqsTypeModel
510

6-
7-
class SQSFifoCircuitBreakerError(Exception):
8-
"""
9-
Signals a record not processed due to the SQS FIFO processing being interrupted
10-
"""
11-
12-
pass
11+
logger = logging.getLogger(__name__)
1312

1413

1514
class SqsFifoPartialProcessor(BatchProcessor):
@@ -57,36 +56,59 @@ def lambda_handler(event, context: LambdaContext):
5756
None,
5857
)
5958

60-
def __init__(self, model: Optional["BatchSqsTypeModel"] = None):
61-
super().__init__(EventType.SQS, model)
59+
group_circuit_breaker_exc = (
60+
SQSFifoMessageGroupCircuitBreakerError,
61+
SQSFifoMessageGroupCircuitBreakerError("A previous record from this message group failed processing"),
62+
None,
63+
)
6264

63-
def process(self) -> List[Tuple]:
65+
def __init__(self, model: Optional["BatchSqsTypeModel"] = None, skip_group_on_error: bool = False):
6466
"""
65-
Call instance's handler for each record. When the first failed message is detected,
66-
the process is short-circuited, and the remaining messages are reported as failed items.
67+
Initialize the SqsFifoProcessor.
68+
69+
Parameters
70+
----------
71+
model: Optional["BatchSqsTypeModel"]
72+
An optional model for batch processing.
73+
skip_group_on_error: bool
74+
Determines whether to exclusively skip messages from the MessageGroupID that encountered processing failures
75+
Default is False.
76+
6777
"""
68-
result: List[Tuple] = []
78+
self._skip_group_on_error: bool = skip_group_on_error
79+
self._current_group_id = None
80+
self._failed_group_ids: Set[str] = set()
81+
super().__init__(EventType.SQS, model)
6982

70-
for i, record in enumerate(self.records):
71-
# If we have failed messages, it means that the last message failed.
72-
# We then short circuit the process, failing the remaining messages
73-
if self.fail_messages:
74-
return self._short_circuit_processing(i, result)
83+
def _process_record(self, record):
84+
self._current_group_id = record.get("attributes", {}).get("MessageGroupId")
7585

76-
# Otherwise, process the message normally
77-
result.append(self._process_record(record))
86+
# Short-circuits the process if:
87+
# - There are failed messages, OR
88+
# - The `skip_group_on_error` option is on, and the current message is part of a failed group.
89+
fail_entire_batch = bool(self.fail_messages) and not self._skip_group_on_error
90+
fail_group_id = self._skip_group_on_error and self._current_group_id in self._failed_group_ids
91+
if fail_entire_batch or fail_group_id:
92+
return self.failure_handler(
93+
record=self._to_batch_type(record, event_type=self.event_type, model=self.model),
94+
exception=self.group_circuit_breaker_exc if self._skip_group_on_error else self.circuit_breaker_exc,
95+
)
7896

79-
return result
97+
return super()._process_record(record)
8098

81-
def _short_circuit_processing(self, first_failure_index: int, result: List[Tuple]) -> List[Tuple]:
82-
"""
83-
Starting from the first failure index, fail all the remaining messages, and append them to the result list.
84-
"""
85-
remaining_records = self.records[first_failure_index:]
86-
for remaining_record in remaining_records:
87-
data = self._to_batch_type(record=remaining_record, event_type=self.event_type, model=self.model)
88-
result.append(self.failure_handler(record=data, exception=self.circuit_breaker_exc))
89-
return result
99+
def failure_handler(self, record, exception: ExceptionInfo) -> FailureResponse:
100+
# If we are failing a message and the `skip_group_on_error` is on, we store the failed group ID
101+
# This way, future messages with the same group ID will be failed automatically.
102+
if self._skip_group_on_error and self._current_group_id:
103+
self._failed_group_ids.add(self._current_group_id)
104+
105+
return super().failure_handler(record, exception)
106+
107+
def _clean(self):
108+
self._failed_group_ids.clear()
109+
self._current_group_id = None
110+
111+
super()._clean()
90112

91113
async def _async_process_record(self, record: dict):
92114
raise NotImplementedError()

docs/utilities/batch.md

+37-3
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,11 @@ Processing batches from SQS works in three stages:
141141

142142
#### FIFO queues
143143

144-
When using [SQS FIFO queues](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/FIFO-queues.html){target="_blank" rel="nofollow"}, we will stop processing messages after the first failure, and return all failed and unprocessed messages in `batchItemFailures`.
145-
This helps preserve the ordering of messages in your queue.
144+
When working with [SQS FIFO queues](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/FIFO-queues.html){target="_blank"}, a batch may include messages from different group IDs.
145+
146+
By default, we will stop processing at the first failure and mark unprocessed messages as failed to preserve ordering. However, this behavior may not be optimal for customers who wish to proceed with processing messages from a different group ID.
147+
148+
Enable the `skip_group_on_error` option for seamless processing of messages from various group IDs. This setup ensures that messages from a failed group ID are sent back to SQS, enabling uninterrupted processing of messages from the subsequent group ID.
146149

147150
=== "Recommended"
148151

@@ -164,6 +167,12 @@ This helps preserve the ordering of messages in your queue.
164167
--8<-- "examples/batch_processing/src/getting_started_sqs_fifo_decorator.py"
165168
```
166169

170+
=== "Enabling skip_group_on_error flag"
171+
172+
```python hl_lines="2-6 9 23"
173+
--8<-- "examples/batch_processing/src/getting_started_sqs_fifo_skip_on_error.py"
174+
```
175+
167176
### Processing messages from Kinesis
168177

169178
Processing batches from Kinesis works in three stages:
@@ -311,7 +320,7 @@ sequenceDiagram
311320

312321
> Read more about [Batch Failure Reporting feature in AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/with-sqs.html#services-sqs-batchfailurereporting){target="_blank"}.
313322
314-
Sequence diagram to explain how [`SqsFifoPartialProcessor` works](#fifo-queues) with SQS FIFO queues.
323+
Sequence diagram to explain how [`SqsFifoPartialProcessor` works](#fifo-queues) with SQS FIFO queues without the `skip_group_on_error` flag.
315324

316325
<center>
317326
```mermaid
@@ -335,6 +344,31 @@ sequenceDiagram
335344
<i>SQS FIFO mechanism with Batch Item Failures</i>
336345
</center>
337346

347+
Sequence diagram to explain how [`SqsFifoPartialProcessor` works](#fifo-queues) with SQS FIFO queues when the `skip_group_on_error` flag is enabled.
348+
349+
<center>
350+
```mermaid
351+
sequenceDiagram
352+
autonumber
353+
participant SQS queue
354+
participant Lambda service
355+
participant Lambda function
356+
Lambda service->>SQS queue: Poll
357+
Lambda service->>Lambda function: Invoke (batch event)
358+
activate Lambda function
359+
Lambda function-->Lambda function: Process 2 out of 10 batch items
360+
Lambda function--xLambda function: Fail on 3rd batch item
361+
Lambda function-->Lambda function: Process messages from another MessageGroupID
362+
Lambda function->>Lambda service: Report 3rd batch item and all messages within the same MessageGroupID as failure
363+
deactivate Lambda function
364+
activate SQS queue
365+
Lambda service->>SQS queue: Delete successful messages processed
366+
SQS queue-->>SQS queue: Failed messages return
367+
deactivate SQS queue
368+
```
369+
<i>SQS FIFO mechanism with Batch Item Failures and the skip_group_on_error flag</i>
370+
</center>
371+
338372
#### Kinesis and DynamoDB Streams
339373

340374
> Read more about [Batch Failure Reporting feature](https://docs.aws.amazon.com/lambda/latest/dg/with-kinesis.html#services-kinesis-batchfailurereporting){target="_blank"}.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from aws_lambda_powertools import Logger, Tracer
2+
from aws_lambda_powertools.utilities.batch import (
3+
SqsFifoPartialProcessor,
4+
process_partial_response,
5+
)
6+
from aws_lambda_powertools.utilities.data_classes.sqs_event import SQSRecord
7+
from aws_lambda_powertools.utilities.typing import LambdaContext
8+
9+
processor = SqsFifoPartialProcessor(skip_group_on_error=True)
10+
tracer = Tracer()
11+
logger = Logger()
12+
13+
14+
@tracer.capture_method
15+
def record_handler(record: SQSRecord):
16+
payload: str = record.json_body # if json string data, otherwise record.body for str
17+
logger.info(payload)
18+
19+
20+
@logger.inject_lambda_context
21+
@tracer.capture_lambda_handler
22+
def lambda_handler(event, context: LambdaContext):
23+
return process_partial_response(event=event, record_handler=record_handler, processor=processor, context=context)

0 commit comments

Comments
 (0)