BUG: Fix to_json lines with escaped characters

rouzazari · jreback · commit 1a18420d97f1 · 2017-01-13T15:08:21.000-05:00
Updates the existing to_json methodology by adding an is_escaping variable, which ensures that escaped characters are handled correctly. xref #14693; closes #15096. Author: Rouz Azari <rouz.azari@gmail.com>. Closes #15117 from rouzazari/to_json_lines_with_escaping and squashes the following commits: d114455 [Rouz Azari] BUG: Fix to_json lines with escaped characters
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -351,6 +351,7 @@ Bug Fixes
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
 - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
+- Bug in ``.to_json()`` with ``lines=True`` where contents (keys or values) containing escaped characters were handled incorrectly (:issue:`15096`)
 
 - Bug in ``DataFrame.groupby().describe()`` when grouping on ``Index`` containing tuples (:issue:`14848`)
 - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`)
diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py
@@ -972,6 +972,15 @@ def test_to_jsonl(self):
         self.assertEqual(result, expected)
         assert_frame_equal(pd.read_json(result, lines=True), df)
 
+        # GH15096: escaped characters in columns and data
+        df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
+                       columns=["a\\", 'b'])
+        result = df.to_json(orient="records", lines=True)
+        expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
+                    '{"a\\\\":"foo\\"","b":"bar"}')
+        self.assertEqual(result, expected)
+        assert_frame_equal(pd.read_json(result, lines=True), df)
+
     def test_latin_encoding(self):
         if compat.PY2:
             self.assertRaisesRegexp(
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -1098,7 +1098,8 @@ def convert_json_to_lines(object arr):
     to quotes & brackets
     """
     cdef:
-        Py_ssize_t i = 0, num_open_brackets_seen = 0, in_quotes = 0, length
+        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
+        bint in_quotes = 0, is_escaping = 0
         ndarray[uint8_t] narr
         unsigned char v, comma, left_bracket, right_brack, newline
 
@@ -1113,8 +1114,10 @@ def convert_json_to_lines(object arr):
     length = narr.shape[0]
     for i in range(length):
         v = narr[i]
-        if v == quote and i > 0 and narr[i - 1] != backslash:
+        if v == quote and i > 0 and not is_escaping:
             in_quotes = ~in_quotes
+        if v == backslash or is_escaping:
+            is_escaping = ~is_escaping
         if v == comma: # commas that should be \n
             if num_open_brackets_seen == 0 and not in_quotes:
                 narr[i] = newline