Skip to content

Commit 73b3040

Browse files
[3.11] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (GH-134341)
If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after destroying that temporary bytes object. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have this issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58) (cherry picked from commit 6279eb8) (cherry picked from commit a75953b) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 461ca2c commit 73b3040

File tree

8 files changed

+197
-57
lines changed

8 files changed

+197
-57
lines changed

Include/cpython/bytesobject.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
2525
int use_bytearray);
2626

2727
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
28+
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
29+
const char *,
30+
int *, const char **);
31+
// Export for binary compatibility.
2832
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
2933
const char *, const char **);
3034

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
914914
);
915915
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
916916
chars. */
917+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
918+
const char *string, /* Unicode-Escape encoded string */
919+
Py_ssize_t length, /* size of string */
920+
const char *errors, /* error handling */
921+
Py_ssize_t *consumed, /* bytes consumed */
922+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
923+
invalid escaped char (<= 0xff) or invalid
924+
octal escape (> 0xff) in string. */
925+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
926+
point to the first invalid escaped
927+
char in string.
928+
May be NULL if errors is not NULL. */
929+
// Export for binary compatibility.
917930
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
918931
const char *string, /* Unicode-Escape encoded string */
919932
Py_ssize_t length, /* size of string */

Lib/test/test_codeccallbacks.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1124,7 +1124,7 @@ def test_bug828737(self):
11241124
text = 'abc<def>ghi'*n
11251125
text.translate(charmap)
11261126

1127-
def test_mutatingdecodehandler(self):
1127+
def test_mutating_decode_handler(self):
11281128
baddata = [
11291129
("ascii", b"\xff"),
11301130
("utf-7", b"++"),
@@ -1159,6 +1159,42 @@ def mutating(exc):
11591159
for (encoding, data) in baddata:
11601160
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
11611161

1162+
def test_mutating_decode_handler_unicode_escape(self):
1163+
decode = codecs.unicode_escape_decode
1164+
def mutating(exc):
1165+
if isinstance(exc, UnicodeDecodeError):
1166+
r = data.get(exc.object[:exc.end])
1167+
if r is not None:
1168+
exc.object = r[0] + exc.object[exc.end:]
1169+
return ('\u0404', r[1])
1170+
raise AssertionError("don't know how to handle %r" % exc)
1171+
1172+
codecs.register_error('test.mutating2', mutating)
1173+
data = {
1174+
br'\x0': (b'\\', 0),
1175+
br'\x3': (b'xxx\\', 3),
1176+
br'\x5': (b'x\\', 1),
1177+
}
1178+
def check(input, expected, msg):
1179+
with self.assertWarns(DeprecationWarning) as cm:
1180+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1181+
self.assertIn(msg, str(cm.warning))
1182+
1183+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1184+
check(br'\x0n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1185+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1186+
1187+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1188+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1189+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1190+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1191+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1192+
1193+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1194+
check(br'\x5n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1195+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1196+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1197+
11621198
# issue32583
11631199
def test_crashing_decode_handler(self):
11641200
# better generating one more character to fill the extra space slot

Lib/test/test_codecs.py

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,23 +1198,39 @@ def test_escape(self):
11981198
check(br"[\1010]", b"[A0]")
11991199
check(br"[\x41]", b"[A]")
12001200
check(br"[\x410]", b"[A0]")
1201+
1202+
def test_warnings(self):
1203+
decode = codecs.escape_decode
1204+
check = coding_checker(self, decode)
12011205
for i in range(97, 123):
12021206
b = bytes([i])
12031207
if b not in b'abfnrtvx':
1204-
with self.assertWarns(DeprecationWarning):
1208+
with self.assertWarnsRegex(DeprecationWarning,
1209+
r"invalid escape sequence '\\%c'" % i):
12051210
check(b"\\" + b, b"\\" + b)
1206-
with self.assertWarns(DeprecationWarning):
1211+
with self.assertWarnsRegex(DeprecationWarning,
1212+
r"invalid escape sequence '\\%c'" % (i-32)):
12071213
check(b"\\" + b.upper(), b"\\" + b.upper())
1208-
with self.assertWarns(DeprecationWarning):
1214+
with self.assertWarnsRegex(DeprecationWarning,
1215+
r"invalid escape sequence '\\8'"):
12091216
check(br"\8", b"\\8")
12101217
with self.assertWarns(DeprecationWarning):
12111218
check(br"\9", b"\\9")
1212-
with self.assertWarns(DeprecationWarning):
1219+
with self.assertWarnsRegex(DeprecationWarning,
1220+
r"invalid escape sequence '\\\xfa'") as cm:
12131221
check(b"\\\xfa", b"\\\xfa")
12141222
for i in range(0o400, 0o1000):
1215-
with self.assertWarns(DeprecationWarning):
1223+
with self.assertWarnsRegex(DeprecationWarning,
1224+
r"invalid octal escape sequence '\\%o'" % i):
12161225
check(rb'\%o' % i, bytes([i & 0o377]))
12171226

1227+
with self.assertWarnsRegex(DeprecationWarning,
1228+
r"invalid escape sequence '\\z'"):
1229+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1230+
with self.assertWarnsRegex(DeprecationWarning,
1231+
r"invalid octal escape sequence '\\501'"):
1232+
self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
1233+
12181234
def test_errors(self):
12191235
decode = codecs.escape_decode
12201236
self.assertRaises(ValueError, decode, br"\x")
@@ -2487,24 +2503,40 @@ def test_escape_decode(self):
24872503
check(br"[\x410]", "[A0]")
24882504
check(br"\u20ac", "\u20ac")
24892505
check(br"\U0001d120", "\U0001d120")
2506+
2507+
def test_decode_warnings(self):
2508+
decode = codecs.unicode_escape_decode
2509+
check = coding_checker(self, decode)
24902510
for i in range(97, 123):
24912511
b = bytes([i])
24922512
if b not in b'abfnrtuvx':
2493-
with self.assertWarns(DeprecationWarning):
2513+
with self.assertWarnsRegex(DeprecationWarning,
2514+
r"invalid escape sequence '\\%c'" % i):
24942515
check(b"\\" + b, "\\" + chr(i))
24952516
if b.upper() not in b'UN':
2496-
with self.assertWarns(DeprecationWarning):
2517+
with self.assertWarnsRegex(DeprecationWarning,
2518+
r"invalid escape sequence '\\%c'" % (i-32)):
24972519
check(b"\\" + b.upper(), "\\" + chr(i-32))
2498-
with self.assertWarns(DeprecationWarning):
2520+
with self.assertWarnsRegex(DeprecationWarning,
2521+
r"invalid escape sequence '\\8'"):
24992522
check(br"\8", "\\8")
25002523
with self.assertWarns(DeprecationWarning):
25012524
check(br"\9", "\\9")
2502-
with self.assertWarns(DeprecationWarning):
2525+
with self.assertWarnsRegex(DeprecationWarning,
2526+
r"invalid escape sequence '\\\xfa'") as cm:
25032527
check(b"\\\xfa", "\\\xfa")
25042528
for i in range(0o400, 0o1000):
2505-
with self.assertWarns(DeprecationWarning):
2529+
with self.assertWarnsRegex(DeprecationWarning,
2530+
r"invalid octal escape sequence '\\%o'" % i):
25062531
check(rb'\%o' % i, chr(i))
25072532

2533+
with self.assertWarnsRegex(DeprecationWarning,
2534+
r"invalid escape sequence '\\z'"):
2535+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
2536+
with self.assertWarnsRegex(DeprecationWarning,
2537+
r"invalid octal escape sequence '\\501'"):
2538+
self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
2539+
25082540
def test_decode_errors(self):
25092541
decode = codecs.unicode_escape_decode
25102542
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

Objects/bytesobject.c

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,10 +1057,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
10571057
}
10581058

10591059
/* Unescape a backslash-escaped string. */
1060-
PyObject *_PyBytes_DecodeEscape(const char *s,
1060+
PyObject *_PyBytes_DecodeEscape2(const char *s,
10611061
Py_ssize_t len,
10621062
const char *errors,
1063-
const char **first_invalid_escape)
1063+
int *first_invalid_escape_char,
1064+
const char **first_invalid_escape_ptr)
10641065
{
10651066
int c;
10661067
char *p;
@@ -1074,7 +1075,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
10741075
return NULL;
10751076
writer.overallocate = 1;
10761077

1077-
*first_invalid_escape = NULL;
1078+
*first_invalid_escape_char = -1;
1079+
*first_invalid_escape_ptr = NULL;
10781080

10791081
end = s + len;
10801082
while (s < end) {
@@ -1112,9 +1114,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11121114
c = (c<<3) + *s++ - '0';
11131115
}
11141116
if (c > 0377) {
1115-
if (*first_invalid_escape == NULL) {
1116-
*first_invalid_escape = s-3; /* Back up 3 chars, since we've
1117-
already incremented s. */
1117+
if (*first_invalid_escape_char == -1) {
1118+
*first_invalid_escape_char = c;
1119+
/* Back up 3 chars, since we've already incremented s. */
1120+
*first_invalid_escape_ptr = s - 3;
11181121
}
11191122
}
11201123
*p++ = c;
@@ -1155,9 +1158,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11551158
break;
11561159

11571160
default:
1158-
if (*first_invalid_escape == NULL) {
1159-
*first_invalid_escape = s-1; /* Back up one char, since we've
1160-
already incremented s. */
1161+
if (*first_invalid_escape_char == -1) {
1162+
*first_invalid_escape_char = (unsigned char)s[-1];
1163+
/* Back up one char, since we've already incremented s. */
1164+
*first_invalid_escape_ptr = s - 1;
11611165
}
11621166
*p++ = '\\';
11631167
s--;
@@ -1171,23 +1175,39 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11711175
return NULL;
11721176
}
11731177

1178+
// Export for binary compatibility.
1179+
PyObject *_PyBytes_DecodeEscape(const char *s,
1180+
Py_ssize_t len,
1181+
const char *errors,
1182+
const char **first_invalid_escape)
1183+
{
1184+
int first_invalid_escape_char;
1185+
return _PyBytes_DecodeEscape2(
1186+
s, len, errors,
1187+
&first_invalid_escape_char,
1188+
first_invalid_escape);
1189+
}
1190+
11741191
PyObject *PyBytes_DecodeEscape(const char *s,
11751192
Py_ssize_t len,
11761193
const char *errors,
11771194
Py_ssize_t Py_UNUSED(unicode),
11781195
const char *Py_UNUSED(recode_encoding))
11791196
{
1180-
const char* first_invalid_escape;
1181-
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
1182-
&first_invalid_escape);
1197+
int first_invalid_escape_char;
1198+
const char *first_invalid_escape_ptr;
1199+
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
1200+
&first_invalid_escape_char,
1201+
&first_invalid_escape_ptr);
11831202
if (result == NULL)
11841203
return NULL;
1185-
if (first_invalid_escape != NULL) {
1186-
unsigned char c = *first_invalid_escape;
1187-
if ('4' <= c && c <= '7') {
1204+
if (first_invalid_escape_char != -1) {
1205+
if (first_invalid_escape_char > 0xff) {
1206+
char buf[12] = "";
1207+
snprintf(buf, sizeof buf, "%o", first_invalid_escape_char);
11881208
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
1189-
"invalid octal escape sequence '\\%.3s'",
1190-
first_invalid_escape) < 0)
1209+
"invalid octal escape sequence '\\%s'",
1210+
buf) < 0)
11911211
{
11921212
Py_DECREF(result);
11931213
return NULL;
@@ -1196,7 +1216,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
11961216
else {
11971217
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
11981218
"invalid escape sequence '\\%c'",
1199-
c) < 0)
1219+
first_invalid_escape_char) < 0)
12001220
{
12011221
Py_DECREF(result);
12021222
return NULL;

0 commit comments

Comments
 (0)