From d11445527a0ccf15e1ad16c1f0211c062a4c7bbf Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Thu, 12 Jan 2017 06:50:59 -0800 Subject: [PATCH] BUG: Fix to_json lines with escaped characters Updates existing to_json methodology by adding is_escaping variable, which ensures escaped chars are handled correctly. - Includes test for escaped characters in keys and values (i.e. columns and data). - Includes bug fix in whatsnew - Revised type of in_quotes and is_escaping to bint xref #14693 xref #15096 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/tests/json/test_pandas.py | 9 +++++++++ pandas/lib.pyx | 7 +++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index b157112b6ff37..9ea7b740bae8f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -351,6 +351,7 @@ Bug Fixes - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) +- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index d7f903153fdae..aaa9752dc6d46 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -972,6 +972,15 @@ def test_to_jsonl(self): self.assertEqual(result, expected) assert_frame_equal(pd.read_json(result, lines=True), df) + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + self.assertEqual(result, expected) + assert_frame_equal(pd.read_json(result, lines=True), df) + def test_latin_encoding(self): if compat.PY2: self.assertRaisesRegexp( diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 761969491cfc7..fce6a3d03287e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1098,7 +1098,8 @@ def convert_json_to_lines(object arr): to quotes & brackets """ cdef: - Py_ssize_t i = 0, num_open_brackets_seen = 0, in_quotes = 0, length + Py_ssize_t i = 0, num_open_brackets_seen = 0, length + bint in_quotes = 0, is_escaping = 0 ndarray[uint8_t] narr unsigned char v, comma, left_bracket, right_brack, newline @@ -1113,8 +1114,10 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and i > 0 and narr[i - 1] != backslash: + if v == quote and i > 0 and not is_escaping: in_quotes = ~in_quotes + if v == backslash or is_escaping: + is_escaping = ~is_escaping if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: narr[i] = newline