Skip to content

Commit 5d1aa08

Browse files
matthiashuschle authored and Krzysztof Chomski committed
BUG: to_json - prevent various segfault conditions (GH14256) (pandas-dev#17857)
1 parent 5717be9 commit 5d1aa08

File tree

5 files changed

+62
-6
lines changed

5 files changed

+62
-6
lines changed

doc/source/whatsnew/v0.21.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,7 @@ I/O
956956
- Bug in :meth:`DataFrame.to_html` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`)
957957
- Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`)
958958
- Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`)
959+
- Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, and overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`)
959960

960961
Plotting
961962
^^^^^^^^
@@ -1033,3 +1034,4 @@ Other
10331034
^^^^^
10341035
- Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`)
10351036
- Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
1037+

pandas/_libs/src/ujson/lib/ultrajson.h

+7
Original file line numberDiff line numberDiff line change
@@ -307,4 +307,11 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec,
307307
const char *buffer, size_t cbBuffer);
308308
EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t);
309309

310+
/*
 * Buffer_Reserve(enc, len)
 *
 * If fewer than `len` bytes lie between the encoder's current write
 * position (`offset`) and the end of its buffer (`end`), call
 * Buffer_Realloc(enc, len). Presumably Buffer_Realloc grows the buffer so
 * the following unchecked writes fit -- confirm against its definition.
 *
 * Both arguments may be evaluated more than once, so pass expressions
 * without side effects.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an un-braced if/else.
 */
#define Buffer_Reserve(__enc, __len)                                      \
    do {                                                                  \
        if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \
            Buffer_Realloc((__enc), (__len));                             \
        }                                                                 \
    } while (0)
314+
315+
void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded);
316+
310317
#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_

pandas/_libs/src/ujson/lib/ultrajsonenc.c

+2-5
Original file line numberDiff line numberDiff line change
@@ -714,11 +714,6 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc,
714714
}
715715
}
716716

717-
#define Buffer_Reserve(__enc, __len) \
718-
if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \
719-
Buffer_Realloc((__enc), (__len)); \
720-
}
721-
722717
/*
 * Stores __chr at the encoder's current write position (`offset`) and then
 * advances `offset` by one. No capacity check is made here, so space must
 * already be available at `offset` when this is used.
 * NOTE: the expansion already ends in ';', so a call written with its own
 * trailing ';' yields an extra empty statement.
 */
#define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr;
723718

724719
FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin,
@@ -976,6 +971,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
976971
}
977972

978973
enc->iterEnd(obj, &tc);
974+
Buffer_Reserve(enc, 2);
979975
Buffer_AppendCharUnchecked(enc, ']');
980976
break;
981977
}
@@ -1003,6 +999,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
1003999
}
10041000

10051001
enc->iterEnd(obj, &tc);
1002+
Buffer_Reserve(enc, 2);
10061003
Buffer_AppendCharUnchecked(enc, '}');
10071004
break;
10081005
}

pandas/_libs/src/ujson/python/objToJSON.c

+6-1
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,7 @@ static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen,
783783
JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
784784
PRINTMARK();
785785
*outLen = strlen(labels[idx]);
786+
Buffer_Reserve(enc, *outLen);
786787
memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen));
787788
enc->offset += *outLen;
788789
*outLen = 0;
@@ -879,7 +880,7 @@ int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) {
879880
NpyArrContext *npyarr;
880881
PRINTMARK();
881882

882-
if (PyErr_Occurred()) {
883+
if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
883884
return 0;
884885
}
885886

@@ -1224,6 +1225,10 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
12241225
PyObject *attrName;
12251226
char *attrStr;
12261227

1228+
if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
1229+
return 0;
1230+
}
1231+
12271232
if (itemValue) {
12281233
Py_DECREF(GET_TC(tc)->itemValue);
12291234
GET_TC(tc)->itemValue = itemValue = NULL;

pandas/tests/io/json/test_pandas.py

+45
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,51 @@ def test_blocks_compat_GH9037(self):
511511
by_blocks=True,
512512
check_exact=True)
513513

514+
def test_frame_nonprintable_bytes(self):
    # GH14256: a column that fails to serialise caused segfaults when it
    # was not the last column of the frame

    class BinaryThing(object):
        # keeps the hex text in ``hexed`` and the decoded bytes in
        # ``binary``; str() returns the hex text

        def __init__(self, hexed):
            self.hexed = hexed
            self.binary = (hexed.decode('hex') if compat.PY2
                           else bytes.fromhex(hexed))

        def __str__(self):
            return self.hexed

    hexed = '574b4454ba8c5eb4f98a8f45'
    binthing = BinaryThing(hexed)
    json_single = '{"A":{"0":"%s"}}' % hexed
    json_mixed = '{"A":{"0":"%s"},"B":{"0":1}}' % hexed

    # printable content is converted as usual
    printable = DataFrame({'A': [binthing.hexed]})
    assert printable.to_json() == json_single

    # non-printable content has to raise the appropriate exception, both
    # on its own and next to a second column (the latter used to segfault)
    nonprintable = DataFrame({'A': [binthing]})
    mixed = DataFrame({'A': [binthing], 'B': [1]}, columns=['A', 'B'])
    for frame in (nonprintable, mixed):
        with pytest.raises(OverflowError):
            frame.to_json()

    # with default_handler the non-string objects are resolved via str()
    assert nonprintable.to_json(default_handler=str) == json_single
    assert mixed.to_json(default_handler=str) == json_mixed
552+
553+
def test_label_overflow(self):
554+
# GH14256: buffer length not checked when writing label
555+
df = pd.DataFrame({'foo': [1337], 'bar' * 100000: [1]})
556+
assert df.to_json() == \
557+
'{"%s":{"0":1},"foo":{"0":1337}}' % ('bar' * 100000)
558+
514559
def test_series_non_unique_index(self):
515560
s = Series(['a', 'b'], index=[1, 1])
516561

0 commit comments

Comments
 (0)