From a5ee0f2819031dfcec188e81c33eb510e5dd453a Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 26 Oct 2016 17:44:11 -0400 Subject: [PATCH 1/9] handle edge case where prior character is an escaped backslash --- pandas/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index b09a1c2755a06..d6e24bd6ad055 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1111,7 +1111,7 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and i > 0 and narr[i - 1] != backslash: + if v == quote and i > 0 and narr[i - 1] != backslash and narr[i - 2] != backslash: in_quotes = ~in_quotes if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: From 2d06e250366d9891f6601940ca52ab948e1cc4c0 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 26 Oct 2016 17:47:57 -0400 Subject: [PATCH 2/9] avoid out of bounds --- pandas/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index d6e24bd6ad055..5489338c5d174 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1111,7 +1111,7 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and i > 0 and narr[i - 1] != backslash and narr[i - 2] != backslash: + if v == quote and i > 0 and narr[i - 1] != backslash and i > 1 and narr[i - 2] != backslash: in_quotes = ~in_quotes if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: From 3a7bc179beaacd04ce4749d60273e26fb6036fc9 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 26 Oct 2016 18:10:45 -0400 Subject: [PATCH 3/9] just check that i > 1, add test --- pandas/io/tests/json/test_pandas.py | 4 ++-- pandas/lib.pyx | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 117ac2324d0e0..a8679854f4395 100644 
--- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -962,9 +962,9 @@ def test_to_jsonl(self): expected = '{"a":1,"b":2}\n{"a":1,"b":2}' self.assertEqual(result, expected) - df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + df = DataFrame([["foo}", "bar"], ['foo"', "bar"], ['foo\\', "bar"]], columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n{"a":"foo\\","b":"bar"}' self.assertEqual(result, expected) assert_frame_equal(pd.read_json(result, lines=True), df) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 5489338c5d174..1b2c0ac07ba21 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1111,8 +1111,11 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and i > 0 and narr[i - 1] != backslash and i > 1 and narr[i - 2] != backslash: - in_quotes = ~in_quotes + if v == quote and \ + ((i > 1 and narr[i - 1] != backslash and + narr[i - 2] != backslash) or + (i > 0 and narr[i - 1] != backslash)): + in_quotes = ~in_quotes if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: narr[i] = newline From b198a785a1b7273b706ea523e2d75073373abbbf Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 26 Oct 2016 18:49:52 -0400 Subject: [PATCH 4/9] correctly handle trailing backslash --- pandas/lib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 1b2c0ac07ba21..b98b1d87e6be8 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1111,10 +1111,8 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and \ - ((i > 1 and narr[i - 1] != backslash and - narr[i - 2] != backslash) or - (i > 0 and narr[i - 1] != backslash)): + if v == quote and i > 1 and (narr[i - 1] != 
backslash or + narr[i - 2] != backslash): in_quotes = ~in_quotes if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: From 933dd5056dea60cb8279f60f1c30c8771f0d383c Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 26 Oct 2016 18:51:29 -0400 Subject: [PATCH 5/9] fixed typo --- pandas/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index b98b1d87e6be8..e8ae23ca65105 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1112,7 +1112,7 @@ def convert_json_to_lines(object arr): for i in range(length): v = narr[i] if v == quote and i > 1 and (narr[i - 1] != backslash or - narr[i - 2] != backslash): + narr[i - 2] == backslash): in_quotes = ~in_quotes if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: From cc86b356208cb10ff4524578d6089265297cd858 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 26 Oct 2016 19:41:03 -0400 Subject: [PATCH 6/9] yet another logic change --- pandas/lib.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index e8ae23ca65105..a02a91b8c09b2 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1111,8 +1111,9 @@ def convert_json_to_lines(object arr): length = narr.shape[0] for i in range(length): v = narr[i] - if v == quote and i > 1 and (narr[i - 1] != backslash or - narr[i - 2] == backslash): + if v == quote: + if not (i > 0 and narr[i - 1] == backslash and + i + 1 < length and narr[i + 1] != comma): in_quotes = ~in_quotes if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: From 87b2798c0888ed3c756869da764f9fd4486d5952 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Fri, 18 Nov 2016 17:48:02 -0500 Subject: [PATCH 7/9] fixed expected data --- pandas/io/tests/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py 
index eaf8c7f809f56..2cb1d711890dd 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -964,7 +964,7 @@ def test_to_jsonl(self): df = DataFrame([["foo}", "bar"], ['foo"', "bar"], ['foo\\', "bar"]], columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n{"a":"foo\\","b":"bar"}' + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n{"a":"foo\\\\","b":"bar"}' self.assertEqual(result, expected) assert_frame_equal(pd.read_json(result, lines=True), df) From abd9e8227bb1b0beb5baeaffe06385d1ef2c9a19 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Fri, 18 Nov 2016 18:13:09 -0500 Subject: [PATCH 8/9] lint --- pandas/io/tests/json/test_pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 2cb1d711890dd..496113659c7a7 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -962,9 +962,11 @@ def test_to_jsonl(self): expected = '{"a":1,"b":2}\n{"a":1,"b":2}' self.assertEqual(result, expected) - df = DataFrame([["foo}", "bar"], ['foo"', "bar"], ['foo\\', "bar"]], columns=['a', 'b']) + df = DataFrame([["foo}", "bar"], ['foo"', "bar"], ['foo\\', "bar"]], + columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n{"a":"foo\\\\","b":"bar"}' + expected = ('{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' + '{"a":"foo\\\\","b":"bar"}') self.assertEqual(result, expected) assert_frame_equal(pd.read_json(result, lines=True), df) From 9908a7c1ecbf0329f54025a200ca1c74fc88d62c Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Sun, 20 Nov 2016 20:15:55 -0500 Subject: [PATCH 9/9] added whatsnew entry --- doc/source/whatsnew/v0.19.2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.2.txt 
b/doc/source/whatsnew/v0.19.2.txt index ecbd6e9b3b288..257cfd41f88dc 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -59,7 +59,7 @@ Bug Fixes - Bug in clipboard functions on Windows 10 and python 3 (:issue:`14362`, :issue:`12807`) - Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`) - +- Bug in ``.to_json()`` with ``lines=True`` when string values contain escaped quotes or end with a backslash (:issue:`14693`)