Skip to content

Commit fab65e6

Browse files
authored
feat(metrics): New normalization of keys, values, units (#2946)
1 parent a584653 commit fab65e6

File tree

2 files changed

+111
-41
lines changed

2 files changed

+111
-41
lines changed

sentry_sdk/metrics.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,6 @@
5454

5555

5656
_in_metrics = ContextVar("in_metrics", default=False)
57-
_sanitize_key = partial(re.compile(r"[^a-zA-Z0-9_/.-]+").sub, "_")
58-
_sanitize_value = partial(re.compile(r"[^\w\d\s_:/@\.{}\[\]$-]+", re.UNICODE).sub, "")
5957
_set = set # set is shadowed below
6058

6159
GOOD_TRANSACTION_SOURCES = frozenset(
@@ -67,6 +65,32 @@
6765
]
6866
)
6967

68+
_sanitize_unit = partial(re.compile(r"[^a-zA-Z0-9_]+").sub, "")
69+
_sanitize_metric_key = partial(re.compile(r"[^a-zA-Z0-9_\-.]+").sub, "_")
70+
_sanitize_tag_key = partial(re.compile(r"[^a-zA-Z0-9_\-.\/]+").sub, "")
71+
_TAG_VALUE_SANITIZATION_TABLE = {
72+
"\n": "\\n",
73+
"\r": "\\r",
74+
"\t": "\\t",
75+
"\\": "\\\\",
76+
"|": "\\u{7c}",
77+
",": "\\u{2c}",
78+
}
79+
80+
81+
def _sanitize_tag_value(value):
82+
# type: (str) -> str
83+
return "".join(
84+
[
85+
(
86+
_TAG_VALUE_SANITIZATION_TABLE[char]
87+
if char in _TAG_VALUE_SANITIZATION_TABLE
88+
else char
89+
)
90+
for char in value
91+
]
92+
)
93+
7094

7195
def get_code_location(stacklevel):
7296
# type: (int) -> Optional[Dict[str, Any]]
@@ -269,7 +293,8 @@ def _encode_metrics(flushable_buckets):
269293
for timestamp, buckets in flushable_buckets:
270294
for bucket_key, metric in iteritems(buckets):
271295
metric_type, metric_name, metric_unit, metric_tags = bucket_key
272-
metric_name = _sanitize_key(metric_name)
296+
metric_name = _sanitize_metric_key(metric_name)
297+
metric_unit = _sanitize_unit(metric_unit)
273298
_write(metric_name.encode("utf-8"))
274299
_write(b"@")
275300
_write(metric_unit.encode("utf-8"))
@@ -285,7 +310,7 @@ def _encode_metrics(flushable_buckets):
285310
_write(b"|#")
286311
first = True
287312
for tag_key, tag_value in metric_tags:
288-
tag_key = _sanitize_key(tag_key)
313+
tag_key = _sanitize_tag_key(tag_key)
289314
if not tag_key:
290315
continue
291316
if first:
@@ -294,7 +319,7 @@ def _encode_metrics(flushable_buckets):
294319
_write(b",")
295320
_write(tag_key.encode("utf-8"))
296321
_write(b":")
297-
_write(_sanitize_value(tag_value).encode("utf-8"))
322+
_write(_sanitize_tag_value(tag_value).encode("utf-8"))
298323

299324
_write(b"|T")
300325
_write(str(timestamp).encode("ascii"))
@@ -309,7 +334,9 @@ def _encode_locations(timestamp, code_locations):
309334

310335
for key, loc in code_locations:
311336
metric_type, name, unit = key
312-
mri = "{}:{}@{}".format(metric_type, _sanitize_key(name), unit)
337+
mri = "{}:{}@{}".format(
338+
metric_type, _sanitize_metric_key(name), _sanitize_unit(unit)
339+
)
313340

314341
loc["type"] = "location"
315342
mapping.setdefault(mri, []).append(loc)

tests/test_metrics.py

+78-35
Original file line numberDiff line numberDiff line change
@@ -677,56 +677,99 @@ def test_metric_summaries(
677677

678678
@minimum_python_37_with_gevent
679679
@pytest.mark.forked
680-
def test_tag_normalization(
681-
sentry_init, capture_envelopes, maybe_monkeypatched_threading
680+
@pytest.mark.parametrize(
681+
"metric_name,metric_unit,expected_name",
682+
[
683+
("first-metric", "nano-second", "first-metric@nanosecond"),
684+
("another_metric?", "nano second", "another_metric_@nanosecond"),
685+
(
686+
"metric",
687+
"nanosecond",
688+
"metric@nanosecond",
689+
),
690+
(
691+
"my.amaze.metric I guess",
692+
"nano|\nsecond",
693+
"my.amaze.metric_I_guess@nanosecond",
694+
),
695+
# fmt: off
696+
(u"métríc", u"nanöseconď", u"m_tr_c@nansecon"),
697+
# fmt: on
698+
],
699+
)
700+
def test_metric_name_normalization(
701+
sentry_init,
702+
capture_envelopes,
703+
metric_name,
704+
metric_unit,
705+
expected_name,
706+
maybe_monkeypatched_threading,
682707
):
683708
sentry_init(
684-
release="[email protected]",
685-
environment="not-fun-env",
686709
_experiments={"enable_metrics": True, "metric_code_locations": False},
687710
)
688-
ts = time.time()
689711
envelopes = capture_envelopes()
690712

691-
# fmt: off
692-
metrics.distribution("a", 1.0, tags={"foo-bar": "%$foo"}, timestamp=ts)
693-
metrics.distribution("b", 1.0, tags={"foo$$$bar": "blah{}"}, timestamp=ts)
694-
metrics.distribution("c", 1.0, tags={u"foö-bar": u"snöwmän"}, timestamp=ts)
695-
metrics.distribution("d", 1.0, tags={"route": "GET /foo"}, timestamp=ts)
696-
# fmt: on
713+
metrics.distribution(metric_name, 1.0, unit=metric_unit)
714+
697715
Hub.current.flush()
698716

699717
(envelope,) = envelopes
700718

701719
assert len(envelope.items) == 1
702720
assert envelope.items[0].headers["type"] == "statsd"
703-
m = parse_metrics(envelope.items[0].payload.get_bytes())
704721

705-
assert len(m) == 4
706-
assert m[0][4] == {
707-
"foo-bar": "$foo",
708-
"release": "[email protected]",
709-
"environment": "not-fun-env",
710-
}
722+
parsed_metrics = parse_metrics(envelope.items[0].payload.get_bytes())
723+
assert len(parsed_metrics) == 1
711724

712-
assert m[1][4] == {
713-
"foo_bar": "blah{}",
714-
"release": "[email protected]",
715-
"environment": "not-fun-env",
716-
}
725+
name = parsed_metrics[0][1]
726+
assert name == expected_name
717727

718-
# fmt: off
719-
assert m[2][4] == {
720-
"fo_-bar": u"snöwmän",
721-
"release": "[email protected]",
722-
"environment": "not-fun-env",
723-
}
724-
assert m[3][4] == {
725-
"release": "[email protected]",
726-
"environment": "not-fun-env",
727-
"route": "GET /foo",
728-
}
729-
# fmt: on
728+
729+
@minimum_python_37_with_gevent
730+
@pytest.mark.forked
731+
@pytest.mark.parametrize(
732+
"metric_tag,expected_tag",
733+
[
734+
({"f-oo|bar": "%$foo/"}, {"f-oobar": "%$foo/"}),
735+
({"foo$.$.$bar": "blah{}"}, {"foo..bar": "blah{}"}),
736+
# fmt: off
737+
({u"foö-bar": u"snöwmän"}, {u"fo-bar": u"snöwmän"},),
738+
# fmt: on
739+
({"route": "GET /foo"}, {"route": "GET /foo"}),
740+
({"__bar__": "this | or , that"}, {"__bar__": "this \\u{7c} or \\u{2c} that"}),
741+
({"foo/": "hello!\n\r\t\\"}, {"foo/": "hello!\\n\\r\\t\\\\"}),
742+
],
743+
)
744+
def test_metric_tag_normalization(
745+
sentry_init,
746+
capture_envelopes,
747+
metric_tag,
748+
expected_tag,
749+
maybe_monkeypatched_threading,
750+
):
751+
sentry_init(
752+
_experiments={"enable_metrics": True, "metric_code_locations": False},
753+
)
754+
envelopes = capture_envelopes()
755+
756+
metrics.distribution("a", 1.0, tags=metric_tag)
757+
758+
Hub.current.flush()
759+
760+
(envelope,) = envelopes
761+
762+
assert len(envelope.items) == 1
763+
assert envelope.items[0].headers["type"] == "statsd"
764+
765+
parsed_metrics = parse_metrics(envelope.items[0].payload.get_bytes())
766+
assert len(parsed_metrics) == 1
767+
768+
tags = parsed_metrics[0][4]
769+
770+
expected_tag_key, expected_tag_value = expected_tag.popitem()
771+
assert expected_tag_key in tags
772+
assert tags[expected_tag_key] == expected_tag_value
730773

731774

732775
@minimum_python_37_with_gevent

0 commit comments

Comments
 (0)