Skip to content

Commit 1a18420

Browse files
rouzazari authored and jreback committed
BUG: Fix to_json lines with escaped characters
Updates the existing to_json methodology by adding an is_escaping variable, which ensures escaped characters are handled correctly.

xref #14693
closes #15096

Author: Rouz Azari <[email protected]>

Closes #15117 from rouzazari/to_json_lines_with_escaping and squashes the following commits:

d114455 [Rouz Azari] BUG: Fix to_json lines with escaped characters
1 parent ab0d236 commit 1a18420

File tree

3 files changed

+15
-2
lines changed

3 files changed

+15
-2
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,7 @@ Bug Fixes
351351
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
352352
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
353353
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
354+
- Bug in ``.to_json()`` with ``lines=True`` where contents (keys or values) containing escaped characters were not handled correctly (:issue:`15096`)
354355

355356
- Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`)
356357
- Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`)

pandas/io/tests/json/test_pandas.py

+9
Original file line numberDiff line numberDiff line change
@@ -972,6 +972,15 @@ def test_to_jsonl(self):
972972
self.assertEqual(result, expected)
973973
assert_frame_equal(pd.read_json(result, lines=True), df)
974974

975+
# GH15096: escaped characters in columns and data
976+
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
977+
columns=["a\\", 'b'])
978+
result = df.to_json(orient="records", lines=True)
979+
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
980+
'{"a\\\\":"foo\\"","b":"bar"}')
981+
self.assertEqual(result, expected)
982+
assert_frame_equal(pd.read_json(result, lines=True), df)
983+
975984
def test_latin_encoding(self):
976985
if compat.PY2:
977986
self.assertRaisesRegexp(

pandas/lib.pyx

+5-2
Original file line numberDiff line numberDiff line change
@@ -1098,7 +1098,8 @@ def convert_json_to_lines(object arr):
10981098
to quotes & brackets
10991099
"""
11001100
cdef:
1101-
Py_ssize_t i = 0, num_open_brackets_seen = 0, in_quotes = 0, length
1101+
Py_ssize_t i = 0, num_open_brackets_seen = 0, length
1102+
bint in_quotes = 0, is_escaping = 0
11021103
ndarray[uint8_t] narr
11031104
unsigned char v, comma, left_bracket, right_brack, newline
11041105

@@ -1113,8 +1114,10 @@ def convert_json_to_lines(object arr):
11131114
length = narr.shape[0]
11141115
for i in range(length):
11151116
v = narr[i]
1116-
if v == quote and i > 0 and narr[i - 1] != backslash:
1117+
if v == quote and i > 0 and not is_escaping:
11171118
in_quotes = ~in_quotes
1119+
if v == backslash or is_escaping:
1120+
is_escaping = ~is_escaping
11181121
if v == comma: # commas that should be \n
11191122
if num_open_brackets_seen == 0 and not in_quotes:
11201123
narr[i] = newline

0 commit comments

Comments
 (0)