Skip to content

Commit b6edd91

Browse files
authored
Redriven Step Functions Trace Merging (#545)
Adds support for Step Functions trace merging in Redrive cases. We previously used `hash(ExecutionId # StateName # StateEnteredTime)` for spanID calculation, but these values are identical across redrives for a Lambda task state. The new approach also adds a `RedriveCount` to the end of the hash but omits this value when it is 0 to have easy backwards compatibility.
1 parent ed8c462 commit b6edd91

File tree

2 files changed

+76
-10
lines changed

2 files changed

+76
-10
lines changed

datadog_lambda/tracing.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -384,12 +384,23 @@ def _parse_high_64_bits(trace_tags: str) -> str:
384384

385385

386386
def _generate_sfn_parent_id(context: dict) -> int:
387+
"""
388+
The upstream Step Function can propagate its execution context to downstream Lambdas. The
389+
Lambda can use these details to share the same traceID and infer its parent's spanID.
390+
391+
Excluding redriveCount when it's 0 to account for cases where customers are using an old
392+
version of the Lambda layer that doesn't use this value for its parentID generation.
393+
"""
387394
execution_id = context.get("Execution").get("Id")
395+
redrive_count = context.get("Execution").get("RedriveCount", 0)
388396
state_name = context.get("State").get("Name")
389397
state_entered_time = context.get("State").get("EnteredTime")
390398

399+
redrive_postfix = "" if redrive_count == 0 else f"#{redrive_count}"
400+
391401
return _deterministic_sha256_hash(
392-
f"{execution_id}#{state_name}#{state_entered_time}", HIGHER_64_BITS
402+
f"{execution_id}#{state_name}#{state_entered_time}{redrive_postfix}",
403+
HIGHER_64_BITS,
393404
)
394405

395406

tests/test_tracing.py

+64-9
Original file line numberDiff line numberDiff line change
@@ -619,30 +619,83 @@ def test_step_function_trace_data(self):
619619
lambda_ctx = get_mock_context()
620620
sfn_event = {
621621
"Execution": {
622-
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
622+
"Id": "arn:aws:states:sa-east-1:425362996713:execution:abhinav-activity-state-machine:72a7ca3e-901c-41bb-b5a3-5f279b92a316",
623+
"Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316",
624+
"RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j",
625+
"StartTime": "2024-12-04T19:38:04.069Z",
623626
},
624-
"StateMachine": {},
625627
"State": {
626-
"Name": "my-awesome-state",
627-
"EnteredTime": "Mon Nov 13 12:43:33 PST 2023",
628+
"Name": "Lambda Invoke",
629+
"EnteredTime": "2024-12-04T19:38:04.118Z",
630+
"RetryCount": 0,
631+
},
632+
"StateMachine": {
633+
"Id": "arn:aws:states:sa-east-1:425362996713:stateMachine:abhinav-activity-state-machine",
634+
"Name": "abhinav-activity-state-machine",
628635
},
629636
}
630637
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
631638
self.assertEqual(source, "event")
632639
expected_context = Context(
633-
trace_id=3675572987363469717,
634-
span_id=6880978411788117524,
640+
trace_id=435175499815315247,
641+
span_id=3929055471293792800,
642+
sampling_priority=1,
643+
meta={"_dd.p.tid": "3e7a89d1b7310603"},
644+
)
645+
self.assertEqual(ctx, expected_context)
646+
self.assertEqual(
647+
get_dd_trace_context(),
648+
{
649+
TraceHeader.TRACE_ID: "435175499815315247",
650+
TraceHeader.PARENT_ID: "10713633173203262661",
651+
TraceHeader.SAMPLING_PRIORITY: "1",
652+
TraceHeader.TAGS: "_dd.p.tid=3e7a89d1b7310603",
653+
},
654+
)
655+
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
656+
self.mock_send_segment.assert_called_with(
657+
XraySubsegment.TRACE_KEY,
658+
expected_context,
659+
)
660+
661+
# https://github.com/DataDog/logs-backend/blob/c17618cb552fc369ca40282bae0a65803f82f694/domains/serverless/apps/logs-to-traces-reducer/src/test/resources/test-json-files/stepfunctions/RedriveTest/snapshots/RedriveLambdaSuccessTraceMerging.json#L46
662+
@with_trace_propagation_style("datadog")
663+
def test_step_function_trace_data_redrive(self):
664+
lambda_ctx = get_mock_context()
665+
sfn_event = {
666+
"Execution": {
667+
"Id": "arn:aws:states:sa-east-1:425362996713:execution:abhinav-activity-state-machine:72a7ca3e-901c-41bb-b5a3-5f279b92a316",
668+
"Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316",
669+
"RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j",
670+
"StartTime": "2024-12-04T19:38:04.069Z",
671+
"RedriveCount": 1,
672+
},
673+
"State": {
674+
"Name": "Lambda Invoke",
675+
"EnteredTime": "2024-12-04T19:38:04.118Z",
676+
"RetryCount": 0,
677+
},
678+
"StateMachine": {
679+
"Id": "arn:aws:states:sa-east-1:425362996713:stateMachine:abhinav-activity-state-machine",
680+
"Name": "abhinav-activity-state-machine",
681+
},
682+
}
683+
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
684+
self.assertEqual(source, "event")
685+
expected_context = Context(
686+
trace_id=435175499815315247,
687+
span_id=5063839446130725204,
635688
sampling_priority=1,
636-
meta={"_dd.p.tid": "e987c84b36b11ab"},
689+
meta={"_dd.p.tid": "3e7a89d1b7310603"},
637690
)
638691
self.assertEqual(ctx, expected_context)
639692
self.assertEqual(
640693
get_dd_trace_context(),
641694
{
642-
TraceHeader.TRACE_ID: "3675572987363469717",
695+
TraceHeader.TRACE_ID: "435175499815315247",
643696
TraceHeader.PARENT_ID: "10713633173203262661",
644697
TraceHeader.SAMPLING_PRIORITY: "1",
645-
TraceHeader.TAGS: "_dd.p.tid=e987c84b36b11ab",
698+
TraceHeader.TAGS: "_dd.p.tid=3e7a89d1b7310603",
646699
},
647700
)
648701
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
@@ -658,6 +711,7 @@ def test_step_function_trace_data_lambda_root(self):
658711
"_datadog": {
659712
"Execution": {
660713
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
714+
"RedriveCount": 0,
661715
},
662716
"StateMachine": {},
663717
"State": {
@@ -700,6 +754,7 @@ def test_step_function_trace_data_sfn_root(self):
700754
"_datadog": {
701755
"Execution": {
702756
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
757+
"RedriveCount": 0,
703758
},
704759
"StateMachine": {},
705760
"State": {

0 commit comments

Comments
 (0)