From e9219936e74d6f17eff10a8577e1263a52d18bb4 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 15 Jun 2020 16:02:04 -0500 Subject: [PATCH 1/8] BUG: reading line-format JSON from file url #27135 --- pandas/io/json/_json.py | 2 ++ pandas/tests/io/json/data/line_delimited.json | 4 ++++ pandas/tests/io/json/test_readlines.py | 12 ++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 pandas/tests/io/json/data/line_delimited.json diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b973553a767ba..900d6cf8fdc29 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -690,6 +690,8 @@ def _preprocess_data(self, data): if hasattr(data, "read") and (not self.chunksize or not self.nrows): data = data.read() if not hasattr(data, "read") and (self.chunksize or self.nrows): + if isinstance(data, bytes): + data = data.decode() data = StringIO(data) return data diff --git a/pandas/tests/io/json/data/line_delimited.json b/pandas/tests/io/json/data/line_delimited.json new file mode 100644 index 0000000000000..108f8c7c5fba6 --- /dev/null +++ b/pandas/tests/io/json/data/line_delimited.json @@ -0,0 +1,4 @@ + + {"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 53462eaaada8d..43dab79f0aa3d 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -219,3 +219,15 @@ def test_readjson_nrows_requires_lines(): msg = "nrows can only be passed if lines=True" with pytest.raises(ValueError, match=msg): pd.read_json(jsonl, lines=False, nrows=2) + + +def test_readjson_lines_chunks_fileurl(datapath): + # GH 27135 + # Test reading line-format JSON from file url + os_path = datapath("io", "json", "data", "line_delimited.json") + file_url = "file://localhost" + os_path + path_reader = pd.read_json(os_path, lines=True, chunksize=1) + df_list = list(path_reader) + url_reader = pd.read_json(file_url, lines=True, 
chunksize=1) + for index, chuck in enumerate(url_reader): + tm.assert_frame_equal(chuck, df_list[index]) From 08e438346e9c6e2bf7686b950d3141422d360a30 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 15 Jun 2020 17:34:15 -0500 Subject: [PATCH 2/8] BUG: move type check to _get_data_from_filepath #27135 --- pandas/io/json/_json.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 900d6cf8fdc29..3371473359857 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,6 +1,6 @@ from collections import abc import functools -from io import StringIO +from io import StringIO, BytesIO from itertools import islice import os from typing import Any, Callable, Optional, Type @@ -690,8 +690,6 @@ def _preprocess_data(self, data): if hasattr(data, "read") and (not self.chunksize or not self.nrows): data = data.read() if not hasattr(data, "read") and (self.chunksize or self.nrows): - if isinstance(data, bytes): - data = data.decode() data = StringIO(data) return data @@ -726,6 +724,9 @@ def _get_data_from_filepath(self, filepath_or_buffer): self.should_close = True self.open_stream = data + if isinstance(data, BytesIO): + data = data.getvalue().decode() + return data def _combine_lines(self, lines) -> str: From 93a46db3e435366fdce5c486d0e5f7048f09fc86 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 15 Jun 2020 19:34:32 -0500 Subject: [PATCH 3/8] BUG: sort import and file url for windows #27135 --- pandas/io/json/_json.py | 2 +- pandas/tests/io/json/test_readlines.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3371473359857..ff37c36962aec 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,6 +1,6 @@ from collections import abc import functools -from io import StringIO, BytesIO +from io import BytesIO, StringIO from itertools import islice import os from typing import Any, 
Callable, Optional, Type diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 43dab79f0aa3d..913e7c97600a3 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,4 +1,6 @@ from io import StringIO +from pathlib import PureWindowsPath +from platform import system import pytest @@ -226,6 +228,8 @@ def test_readjson_lines_chunks_fileurl(datapath): # Test reading line-format JSON from file url os_path = datapath("io", "json", "data", "line_delimited.json") file_url = "file://localhost" + os_path + if system() == "Windows": + file_url = PureWindowsPath(file_url) path_reader = pd.read_json(os_path, lines=True, chunksize=1) df_list = list(path_reader) url_reader = pd.read_json(file_url, lines=True, chunksize=1) From 3b28e5ee578374c9a3a62500c7cd3cd48d60571f Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Jun 2020 11:51:48 -0500 Subject: [PATCH 4/8] BUG: explicit test case #27135 --- pandas/tests/io/json/test_readlines.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 913e7c97600a3..2d58f98c1caaa 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -226,12 +226,15 @@ def test_readjson_nrows_requires_lines(): def test_readjson_lines_chunks_fileurl(datapath): # GH 27135 # Test reading line-format JSON from file url + df_list_expected = [ + pd.DataFrame([[1, 2]], columns=["a", "b"], index=[0]), + pd.DataFrame([[3, 4]], columns=["a", "b"], index=[1]), + pd.DataFrame([[5, 6]], columns=["a", "b"], index=[2]), + ] os_path = datapath("io", "json", "data", "line_delimited.json") file_url = "file://localhost" + os_path if system() == "Windows": file_url = PureWindowsPath(file_url) - path_reader = pd.read_json(os_path, lines=True, chunksize=1) - df_list = list(path_reader) url_reader = pd.read_json(file_url, lines=True, 
chunksize=1) for index, chuck in enumerate(url_reader): - tm.assert_frame_equal(chuck, df_list[index]) + tm.assert_frame_equal(chuck, df_list_expected[index]) From b34bc15f613cf036e52285409eb4dacca78ef9a0 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Jun 2020 11:53:08 -0500 Subject: [PATCH 5/8] BUG: update json file #27135 --- pandas/tests/io/json/data/line_delimited.json | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/data/line_delimited.json b/pandas/tests/io/json/data/line_delimited.json index 108f8c7c5fba6..be84245329583 100644 --- a/pandas/tests/io/json/data/line_delimited.json +++ b/pandas/tests/io/json/data/line_delimited.json @@ -1,4 +1,3 @@ - {"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} From d7b06d5c1f1a81f5204ea3f2ff2a7f34cea472ab Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 16 Jun 2020 14:30:03 -0500 Subject: [PATCH 6/8] BUG: construct path #27135 --- pandas/tests/io/json/test_readlines.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 2d58f98c1caaa..b475fa2c514ff 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,6 +1,5 @@ from io import StringIO -from pathlib import PureWindowsPath -from platform import system +from pathlib import Path import pytest @@ -232,9 +231,7 @@ def test_readjson_lines_chunks_fileurl(datapath): pd.DataFrame([[5, 6]], columns=["a", "b"], index=[2]), ] os_path = datapath("io", "json", "data", "line_delimited.json") - file_url = "file://localhost" + os_path - if system() == "Windows": - file_url = PureWindowsPath(file_url) + file_url = Path(os_path).as_uri() url_reader = pd.read_json(file_url, lines=True, chunksize=1) for index, chuck in enumerate(url_reader): tm.assert_frame_equal(chuck, df_list_expected[index]) From f58c4267d37d88d05bfbe20b8cb835815d314854 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 17 Jun 2020 
16:55:44 -0500 Subject: [PATCH 7/8] add entry to whatsnews doc 1.10 (#27135) --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 10522ff797c59..27a7ffa2b3b4e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -988,6 +988,7 @@ I/O - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) +- :meth:`read_json` can now read line-delimited JSON from a file URL (:issue:`27135`) Plotting ^^^^^^^^ From dab1d401a1cd56d46d228c6bc85d909c13fd09d5 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sat, 27 Jun 2020 01:54:00 -0500 Subject: [PATCH 8/8] whatsnew 1.1 (#27135) --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c5eb2febe8ae9..70c45acec9f35 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1040,6 +1040,7 @@ I/O - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) +- :meth:`read_json` can now read a line-delimited JSON file from a file URL when ``lines`` and ``chunksize`` are set (:issue:`27135`) Plotting ^^^^^^^^