diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 4a6cf117fd196..1abd24a6fbc10 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -733,6 +733,7 @@ I/O
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
+- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index e032e26d771d7..6b4f6c05c3123 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -32,10 +32,12 @@
 from pandas.core.dtypes.common import (
     ensure_str,
     is_string_dtype,
+    pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import PeriodDtype
 
 from pandas import (
+    ArrowDtype,
     DataFrame,
     Index,
     MultiIndex,
@@ -942,29 +944,67 @@ def read(self) -> DataFrame | Series:
         obj: DataFrame | Series
         with self:
             if self.engine == "pyarrow":
-                pyarrow_json = import_optional_dependency("pyarrow.json")
-                pa_table = pyarrow_json.read_json(self.data)
-                return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
+                obj = self._read_pyarrow()
             elif self.engine == "ujson":
-                if self.lines:
-                    if self.chunksize:
-                        obj = concat(self)
-                    elif self.nrows:
-                        lines = list(islice(self.data, self.nrows))
-                        lines_json = self._combine_lines(lines)
-                        obj = self._get_object_parser(lines_json)
-                    else:
-                        data = ensure_str(self.data)
-                        data_lines = data.split("\n")
-                        obj = self._get_object_parser(self._combine_lines(data_lines))
-                else:
-                    obj = self._get_object_parser(self.data)
-                if self.dtype_backend is not lib.no_default:
-                    return obj.convert_dtypes(
-                        infer_objects=False, dtype_backend=self.dtype_backend
-                    )
-                else:
-                    return obj
+                obj = self._read_ujson()
+
+        return obj
+
+    def _read_pyarrow(self) -> DataFrame:
+        """
+        Read JSON using the pyarrow engine.
+
+        When ``dtype`` is a dict, every entry that resolves to an
+        :class:`ArrowDtype` is passed to pyarrow as an explicit schema so
+        the column is parsed with the requested type. Entries with other
+        dtypes, and columns missing from ``dtype``, are left to pyarrow's
+        type inference.
+        """
+        pyarrow_json = import_optional_dependency("pyarrow.json")
+        options = None
+
+        if isinstance(self.dtype, dict):
+            # Only ArrowDtype entries carry a pyarrow type usable in a schema.
+            fields = []
+            for field, dtype in self.dtype.items():
+                pd_dtype = pandas_dtype(dtype)
+                if isinstance(pd_dtype, ArrowDtype):
+                    fields.append((field, pd_dtype.pyarrow_dtype))
+
+            if fields:
+                pa = import_optional_dependency("pyarrow")
+                options = pyarrow_json.ParseOptions(
+                    explicit_schema=pa.schema(fields),
+                    unexpected_field_behavior="infer",
+                )
+
+        pa_table = pyarrow_json.read_json(self.data, parse_options=options)
+        return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
+
+    def _read_ujson(self) -> DataFrame | Series:
+        """
+        Read JSON using the ujson engine.
+        """
+        obj: DataFrame | Series
+        if self.lines:
+            if self.chunksize:
+                obj = concat(self)
+            elif self.nrows:
+                lines = list(islice(self.data, self.nrows))
+                lines_json = self._combine_lines(lines)
+                obj = self._get_object_parser(lines_json)
+            else:
+                data = ensure_str(self.data)
+                data_lines = data.split("\n")
+                obj = self._get_object_parser(self._combine_lines(data_lines))
+        else:
+            obj = self._get_object_parser(self.data)
+        if self.dtype_backend is not lib.no_default:
+            return obj.convert_dtypes(
+                infer_objects=False, dtype_backend=self.dtype_backend
+            )
+        else:
+            return obj
 
     def _get_object_parser(self, json: str) -> DataFrame | Series:
         """
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 144b36166261b..fde9940ea78eb 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1,6 +1,9 @@
 import datetime
 from datetime import timedelta
-from io import StringIO
+from io import (
+    BytesIO,
+    StringIO,
+)
 import json
 import os
 import sys
@@ -2183,6 +2186,31 @@ def test_read_json_dtype_backend(
         # string_storage setting -> ignore that for checking the result
         tm.assert_frame_equal(result, expected, check_column_type=False)
 
+    @td.skip_if_no("pyarrow")
+    @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+    def test_read_json_pyarrow_with_dtype(self):
+        # GH#59516
+        dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"}
+        data = b'{"a": 1, "b": 2}\n'
+
+        df = read_json(
+            BytesIO(data),
+            dtype=dtype,
+            lines=True,
+            engine="pyarrow",
+            dtype_backend="pyarrow",
+        )
+
+        result = df.dtypes
+        expected = Series(
+            data=[
+                pd.ArrowDtype.construct_from_string("int32[pyarrow]"),
+                pd.ArrowDtype.construct_from_string("int64[pyarrow]"),
+            ],
+            index=["a", "b"],
+        )
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("orient", ["split", "records", "index"])
     def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
         # GH#50750