diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c5eb2febe8ae9..70c45acec9f35 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1040,6 +1040,7 @@ I/O - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) +- :meth:`read_json` can now read a line-delimited JSON file from a file URL when ``lines`` and ``chunksize`` are set (:issue:`27135`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b973553a767ba..ff37c36962aec 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,6 +1,6 @@ from collections import abc import functools -from io import StringIO +from io import BytesIO, StringIO from itertools import islice import os from typing import Any, Callable, Optional, Type @@ -724,6 +724,9 @@ def _get_data_from_filepath(self, filepath_or_buffer): self.should_close = True self.open_stream = data + if isinstance(data, BytesIO): + data = data.getvalue().decode() + return data def _combine_lines(self, lines) -> str: diff --git a/pandas/tests/io/json/data/line_delimited.json b/pandas/tests/io/json/data/line_delimited.json new file mode 100644 index 0000000000000..be84245329583 --- /dev/null +++ b/pandas/tests/io/json/data/line_delimited.json @@ -0,0 +1,3 @@ + {"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 53462eaaada8d..b475fa2c514ff 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,4 +1,5 @@ from io import StringIO +from pathlib import Path import pytest @@ -219,3 +220,18 @@ def test_readjson_nrows_requires_lines(): msg = "nrows can only be passed if 
lines=True" with pytest.raises(ValueError, match=msg): pd.read_json(jsonl, lines=False, nrows=2) + + +def test_readjson_lines_chunks_fileurl(datapath): + # GH 27135 + # Chunked read of line-delimited JSON from a file URL: one frame per line + df_list_expected = [ + pd.DataFrame([[1, 2]], columns=["a", "b"], index=[0]), + pd.DataFrame([[3, 4]], columns=["a", "b"], index=[1]), + pd.DataFrame([[5, 6]], columns=["a", "b"], index=[2]), + ] + file_url = Path(datapath("io", "json", "data", "line_delimited.json")).as_uri() + chunks = list(pd.read_json(file_url, lines=True, chunksize=1)) + assert len(chunks) == len(df_list_expected) + for chunk, expected in zip(chunks, df_list_expected): + tm.assert_frame_equal(chunk, expected)