From 58074fdbe01f61ba00d00b1602dce5021bed24d7 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Mon, 10 Oct 2016 18:54:34 -0400 Subject: [PATCH 01/11] fix for quoted special characters --- pandas/io/json.py | 13 +++++++++---- pandas/io/tests/json/test_pandas.py | 5 +++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index e697351484f68..7909cd63fe005 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -607,14 +607,19 @@ def _convert_to_line_delimits(s): s = s[1:-1] num_open_brackets_seen = 0 commas_to_replace = [] + in_quotes = False for idx, char in enumerate(s): # iter through to find all - if char == ',': # commas that should be \n - if num_open_brackets_seen == 0: + if char == '"': + in_quotes = ~in_quotes + elif char == ',': # commas that should be \n + if num_open_brackets_seen == 0 and not in_quotes: commas_to_replace.append(idx) elif char == '{': - num_open_brackets_seen += 1 + if not in_quotes: + num_open_brackets_seen += 1 elif char == '}': - num_open_brackets_seen -= 1 + if not in_quotes: + num_open_brackets_seen -= 1 s_arr = np.array(list(s)) # Turn to an array to set s_arr[commas_to_replace] = '\n' # all commas at once. s = ''.join(s_arr) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 47bdd25572fc7..f5b1019a7281b 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -962,6 +962,11 @@ def test_to_jsonl(self): expected = '{"a":1,"b":2}\n{"a":1,"b":2}' self.assertEqual(result, expected) + df = DataFrame([["foo}", "bar"], ["foo", "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo},"b":"bar"}\n{"a":"foo","b":"bar"}' + self.assertEqual(result, expected) + def test_latin_encoding(self): if compat.PY2: self.assertRaisesRegexp( From 9b150b591e1af4f8aa333408147aab2eec201e0f Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Mon, 10 Oct 2016 19:09:14 -0400 Subject: [PATCH 02/11] fix typo in expected output --- pandas/io/tests/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index f5b1019a7281b..6d17c526f5def 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -964,7 +964,7 @@ def test_to_jsonl(self): df = DataFrame([["foo}", "bar"], ["foo", "bar"]], columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo},"b":"bar"}\n{"a":"foo","b":"bar"}' + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo","b":"bar"}' self.assertEqual(result, expected) def test_latin_encoding(self): From d2724d3ba61b3cb60413545318d7bfd67625e485 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Mon, 10 Oct 2016 19:12:40 -0400 Subject: [PATCH 03/11] added whatsnew entry --- doc/source/whatsnew/v0.19.1.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3edb8c1fa9071..843dc980d420c 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -45,3 +45,4 @@ Bug Fixes - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`) +- Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) From 3a9dd5d996ba55c72f9db0dcad4a1017c1957c88 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Tue, 11 Oct 2016 09:45:50 -0400 Subject: [PATCH 04/11] added asv benchmark --- asv_bench/benchmarks/packers.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 3f80c4c0c6338..50226a35bd902 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -546,6 +546,35 @@ def remove(self, f): except: pass +class packers_write_json_lines(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + self.remove(self.f) + self.df.index = np.arange(self.N) + + def time_packers_write_json_lines(self): + self.df.to_json(self.f, orient="records", lines=True) + + def teardown(self): + self.remove(self.f) + + def remove(self, f): + try: + os.remove(self.f) + except: + pass + class packers_write_json_T(object): goal_time = 0.2 From 3560a8e4460b597fe051dde9c310343137ef4f10 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Tue, 11 Oct 2016 10:08:58 -0400 Subject: [PATCH 05/11] added whitespace --- asv_bench/benchmarks/packers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 50226a35bd902..1ef5b7f285d65 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -546,6 +546,7 @@ def remove(self, f): except: pass + class packers_write_json_lines(object): goal_time = 0.2 From 2aefa8572a6e0affebc79a6f01beda242ab1dbbc Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 12 Oct 2016 04:56:10 -0400 Subject: [PATCH 06/11] handle double quotes in strings --- pandas/io/json.py | 2 +- pandas/io/tests/json/test_pandas.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 7909cd63fe005..2572a4bb742a2 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -609,7 +609,7 @@ def _convert_to_line_delimits(s): commas_to_replace = [] in_quotes = False for idx, char in enumerate(s): # iter through to find all - if char == '"': + if char == '"' and idx > 0 and s[idx-1] != '\\': in_quotes = ~in_quotes elif char == ',': # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 6d17c526f5def..b007c5b578c7d 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -962,9 +962,9 @@ def test_to_jsonl(self): expected = '{"a":1,"b":2}\n{"a":1,"b":2}' self.assertEqual(result, expected) - df = DataFrame([["foo}", "bar"], ["foo", "bar"]], columns=['a', 'b']) + df = DataFrame([["foo}", "bar"], ["foo\"", "bar"]], columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo","b":"bar"}' + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\"","b":"bar"}' self.assertEqual(result, expected) def test_latin_encoding(self): From 444e6c2d9cafe2ede61d07daa893cf499c86b104 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 12 Oct 2016 05:35:55 -0400 Subject: [PATCH 07/11] fix typo --- pandas/io/tests/json/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index b007c5b578c7d..3de4c2e4e8e46 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -962,9 +962,9 @@ def test_to_jsonl(self): expected = '{"a":1,"b":2}\n{"a":1,"b":2}' self.assertEqual(result, expected) - df = DataFrame([["foo}", "bar"], ["foo\"", "bar"]], columns=['a', 'b']) + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\"","b":"bar"}' + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo"","b":"bar"}' self.assertEqual(result, expected) def test_latin_encoding(self): From be43f39d84a9bcda333d3c6d3925ffa2e65924f4 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 12 Oct 2016 06:41:28 -0400 Subject: [PATCH 08/11] fixed string formatting --- pandas/io/tests/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 3de4c2e4e8e46..9ba08103f023a 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -964,7 +964,7 @@ def test_to_jsonl(self): df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo"","b":"bar"}' + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' self.assertEqual(result, expected) def test_latin_encoding(self): From edb148824da7a321c6e76abf464a6993fe7be714 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 12 Oct 2016 08:26:24 -0400 Subject: [PATCH 09/11] lint --- pandas/io/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 2572a4bb742a2..66a8e76c09a6f 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -609,7 +609,7 @@ def _convert_to_line_delimits(s): commas_to_replace = [] in_quotes = False for idx, char in enumerate(s): # iter through to find all - if char == '"' and idx > 0 and s[idx-1] != '\\': + if char == '"' and idx > 0 and s[idx - 1] != '\\': in_quotes = ~in_quotes elif char == ',': # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: From 8b057ff85bb635a425c0189a031000a4d9fdb7d4 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 12 Oct 2016 16:51:01 -0400 Subject: [PATCH 10/11] remove duplicate and assert round_trip works --- asv_bench/benchmarks/packers.py | 3 --- pandas/io/tests/json/test_pandas.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 1ef5b7f285d65..2bc1bc5101cce 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -556,9 +556,6 @@ def setup(self): self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] self.remove(self.f) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 9ba08103f023a..65311b5160aa7 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -966,6 +966,7 @@ def test_to_jsonl(self): result = df.to_json(orient="records", lines=True) expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' self.assertEqual(result, expected) + assert_frame_equal(pd.read_json(result, lines=True), df) def test_latin_encoding(self): if compat.PY2: From 8591cb99f74bcda4523915481db292cc801c4a97 Mon Sep 17 00:00:00 2001 From: Josh Owen Date: Wed, 12 Oct 2016 17:04:58 -0400 Subject: [PATCH 11/11] remove duplicated code --- asv_bench/benchmarks/packers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 2bc1bc5101cce..5419571c75b43 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -556,8 +556,6 @@ def setup(self): self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] self.remove(self.f) self.df.index = np.arange(self.N)