diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
index 3f80c4c0c6338..5419571c75b43 100644
--- a/asv_bench/benchmarks/packers.py
+++ b/asv_bench/benchmarks/packers.py
@@ -547,6 +547,32 @@ def remove(self, f):
             pass
 
 
+class packers_write_json_lines(object):
+    """Time ``DataFrame.to_json(orient="records", lines=True)``."""
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.msg'
+        self.N = 100000
+        self.C = 5
+        self.index = date_range('20000101', periods=self.N, freq='H')
+        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+        self.remove(self.f)
+        self.df.index = np.arange(self.N)
+
+    def time_packers_write_json_lines(self):
+        self.df.to_json(self.f, orient="records", lines=True)
+
+    def teardown(self):
+        self.remove(self.f)
+
+    def remove(self, f):
+        try:
+            os.remove(f)
+        except OSError:
+            pass
+
+
 class packers_write_json_T(object):
     goal_time = 0.2
 
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index 3edb8c1fa9071..843dc980d420c 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -45,3 +45,4 @@ Bug Fixes
 
 - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
 - Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`)
+- Bug in ``DataFrame.to_json`` where ``lines=True`` split records incorrectly when a string value contained a ``}`` character (:issue:`14391`)
diff --git a/pandas/io/json.py b/pandas/io/json.py
index e697351484f68..66a8e76c09a6f 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -607,14 +607,26 @@ def _convert_to_line_delimits(s):
     s = s[1:-1]
     num_open_brackets_seen = 0
     commas_to_replace = []
+    in_quotes = False    # True while scanning inside a JSON string
+    escaped = False      # True when the previous char was an escaping '\'
     for idx, char in enumerate(s):              # iter through to find all
-        if char == ',':                         # commas that should be \n
-            if num_open_brackets_seen == 0:
+        if escaped:
+            # This char is the target of a backslash escape (e.g. the
+            # second char of \" or \\), so it can never be structural.
+            escaped = False
+        elif char == '\\':
+            escaped = True
+        elif char == '"':
+            in_quotes = not in_quotes
+        elif char == ',':                       # commas that should be \n
+            if num_open_brackets_seen == 0 and not in_quotes:
                 commas_to_replace.append(idx)
         elif char == '{':
-            num_open_brackets_seen += 1
+            if not in_quotes:
+                num_open_brackets_seen += 1
         elif char == '}':
-            num_open_brackets_seen -= 1
+            if not in_quotes:
+                num_open_brackets_seen -= 1
     s_arr = np.array(list(s))                   # Turn to an array to set
     s_arr[commas_to_replace] = '\n'             # all commas at once.
     s = ''.join(s_arr)
diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py
index 47bdd25572fc7..65311b5160aa7 100644
--- a/pandas/io/tests/json/test_pandas.py
+++ b/pandas/io/tests/json/test_pandas.py
@@ -962,6 +962,20 @@ def test_to_jsonl(self):
         expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
         self.assertEqual(result, expected)
 
+        df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
+        result = df.to_json(orient="records", lines=True)
+        expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
+        self.assertEqual(result, expected)
+        assert_frame_equal(pd.read_json(result, lines=True), df)
+
+        # A value ending in a backslash is serialised as an escaped
+        # backslash, so the quote that follows it does close the string.
+        df = DataFrame([["foo\\", "bar"], ["foo", "bar"]],
+                       columns=['a', 'b'])
+        result = df.to_json(orient="records", lines=True)
+        expected = '{"a":"foo\\\\","b":"bar"}\n{"a":"foo","b":"bar"}'
+        self.assertEqual(result, expected)
+
     def test_latin_encoding(self):
         if compat.PY2:
             self.assertRaisesRegexp(