From 835edf643162493918ac46b6dc2e712e427ccbcd Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 11:58:51 +0000 Subject: [PATCH 01/26] fix: pass dtypes to read_json with pyarrow engine --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/json/_json.py | 73 ++++++++++++++++++++--------- pandas/tests/io/json/test_pandas.py | 22 +++++++++ 3 files changed, 74 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 090be6dc250ba..a7c4ab67c73b7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -732,6 +732,7 @@ I/O - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) +- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) Period ^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e032e26d771d7..b28f29944eb34 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -32,6 +32,7 @@ from pandas.core.dtypes.common import ( ensure_str, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype @@ -43,6 +44,7 @@ isna, notna, to_datetime, + ArrowDtype, ) from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs @@ -942,29 +944,56 @@ def read(self) -> DataFrame | Series: obj: DataFrame | Series with self: if self.engine == "pyarrow": - pyarrow_json = import_optional_dependency("pyarrow.json") - pa_table = pyarrow_json.read_json(self.data) - return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) + obj = 
self._read_pyarrow() elif self.engine == "ujson": - if self.lines: - if self.chunksize: - obj = concat(self) - elif self.nrows: - lines = list(islice(self.data, self.nrows)) - lines_json = self._combine_lines(lines) - obj = self._get_object_parser(lines_json) - else: - data = ensure_str(self.data) - data_lines = data.split("\n") - obj = self._get_object_parser(self._combine_lines(data_lines)) - else: - obj = self._get_object_parser(self.data) - if self.dtype_backend is not lib.no_default: - return obj.convert_dtypes( - infer_objects=False, dtype_backend=self.dtype_backend - ) - else: - return obj + obj = self._read_ujson() + + return obj + + def _read_pyarrow(self) -> DataFrame: + """ + Read JSON using the pyarrow engine. + """ + pyarrow_json = import_optional_dependency("pyarrow.json") + options = None + + if isinstance(self.dtype, dict): + pa = import_optional_dependency("pyarrow") + fields = [ + (field, pandas_dtype(dtype).pyarrow_dtype) + for field, dtype in self.dtype.items() + if isinstance(pandas_dtype(dtype), ArrowDtype) + ] + + schema = pa.schema(fields) + options = pyarrow_json.ParseOptions(explicit_schema=schema) + + pa_table = pyarrow_json.read_json(self.data, parse_options=options) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) + + def _read_ujson(self) -> DataFrame | Series: + """ + Read JSON using the ujson engine. 
+ """ + if self.lines: + if self.chunksize: + obj = concat(self) + elif self.nrows: + lines = list(islice(self.data, self.nrows)) + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + else: + data = ensure_str(self.data) + data_lines = data.split("\n") + obj = self._get_object_parser(self._combine_lines(data_lines)) + else: + obj = self._get_object_parser(self.data) + if self.dtype_backend is not lib.no_default: + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) + else: + return obj def _get_object_parser(self, json: str) -> DataFrame | Series: """ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 144b36166261b..c03f203deba11 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2183,6 +2183,28 @@ def test_read_json_dtype_backend( # string_storage setting -> ignore that for checking the result tm.assert_frame_equal(result, expected, check_column_type=False) + @td.skip_if_no("pyarrow") # type: ignore + def test_read_json_pyarrow_with_dtype(self, datapath): + dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} + + df = read_json( + datapath("io", "json", "data", "line_delimited.json"), + dtype=dtype, + lines=True, + engine="pyarrow", + dtype_backend="pyarrow", + ) + + result = df.dtypes + expected = Series( + [ + pd.ArrowDtype.construct_from_string("int32[pyarrow]"), + pd.ArrowDtype.construct_from_string("int64[pyarrow]"), + ], + index=["a", "b"], + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("orient", ["split", "records", "index"]) def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): # GH#50750 From 074b3cb876ca23d25578a73b805965a715437415 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:14:37 +0000 Subject: [PATCH 02/26] fix: code checks --- doc/source/whatsnew/v3.0.0.rst | 2 +- 
pandas/io/json/_json.py | 8 ++++---- pandas/tests/io/json/test_pandas.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a7c4ab67c73b7..f3f6bb9bf08d5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -725,6 +725,7 @@ I/O - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) +- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) @@ -732,7 +733,6 @@ I/O - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) -- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) Period ^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b28f29944eb34..5256b91388198 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -37,6 +37,7 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( + ArrowDtype, DataFrame, Index, MultiIndex, @@ -44,7 +45,6 @@ isna, notna, to_datetime, - ArrowDtype, ) from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs @@ -947,7 +947,7 @@ def read(self) -> DataFrame | Series: obj = self._read_pyarrow() elif self.engine == "ujson": obj = self._read_ujson() - + return obj def _read_pyarrow(self) -> DataFrame: @@ -967,10 +967,10 @@ def _read_pyarrow(self) -> DataFrame: schema = pa.schema(fields) options = pyarrow_json.ParseOptions(explicit_schema=schema) - + pa_table = pyarrow_json.read_json(self.data, parse_options=options) return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) - + def _read_ujson(self) -> DataFrame | Series: """ Read JSON using the ujson engine. 
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c03f203deba11..0d2ef94be2d5c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2183,7 +2183,7 @@ def test_read_json_dtype_backend( # string_storage setting -> ignore that for checking the result tm.assert_frame_equal(result, expected, check_column_type=False) - @td.skip_if_no("pyarrow") # type: ignore + @td.skip_if_no("pyarrow") def test_read_json_pyarrow_with_dtype(self, datapath): dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} From 28fa3322cb24856abc501977117cdd496dfdbe9c Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:57:41 +0000 Subject: [PATCH 03/26] fix: commit checks --- pandas/io/json/_json.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 5256b91388198..7934d625851d1 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -959,11 +959,11 @@ def _read_pyarrow(self) -> DataFrame: if isinstance(self.dtype, dict): pa = import_optional_dependency("pyarrow") - fields = [ - (field, pandas_dtype(dtype).pyarrow_dtype) - for field, dtype in self.dtype.items() - if isinstance(pandas_dtype(dtype), ArrowDtype) - ] + fields = [] + for field, dtype in self.dtype.items(): + pd_dtype = pandas_dtype(dtype) + if isinstance(pd_dtype, ArrowDtype): + fields.append((field, pd_dtype.pyarrow_dtype)) schema = pa.schema(fields) options = pyarrow_json.ParseOptions(explicit_schema=schema) From 5a8158b703b67c3ce5f0a4496871823a6b48457f Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:17:01 +0000 Subject: [PATCH 04/26] fix: commit checks --- pandas/io/json/_json.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 
7934d625851d1..9069de4896b6b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -981,13 +981,13 @@ def _read_ujson(self) -> DataFrame | Series: elif self.nrows: lines = list(islice(self.data, self.nrows)) lines_json = self._combine_lines(lines) - obj = self._get_object_parser(lines_json) + obj: DataFrame | Series = self._get_object_parser(lines_json) else: data = ensure_str(self.data) data_lines = data.split("\n") - obj = self._get_object_parser(self._combine_lines(data_lines)) + obj: DataFrame | Series = self._get_object_parser(self._combine_lines(data_lines)) else: - obj = self._get_object_parser(self.data) + obj: DataFrame | Series = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend From 73f18a4f888da76b436bdc91fca2c6dcd37b615a Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:27:14 +0000 Subject: [PATCH 05/26] fix: commit checks --- pandas/io/json/_json.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 9069de4896b6b..e9ec4d7881414 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -981,13 +981,19 @@ def _read_ujson(self) -> DataFrame | Series: elif self.nrows: lines = list(islice(self.data, self.nrows)) lines_json = self._combine_lines(lines) - obj: DataFrame | Series = self._get_object_parser(lines_json) + obj: DataFrame | Series = self._get_object_parser( + lines_json + ) else: data = ensure_str(self.data) data_lines = data.split("\n") - obj: DataFrame | Series = self._get_object_parser(self._combine_lines(data_lines)) + obj: DataFrame | Series = self._get_object_parser( + self._combine_lines(data_lines) + ) else: - obj: DataFrame | Series = self._get_object_parser(self.data) + obj: DataFrame | Series = self._get_object_parser( + self.data + ) if 
self.dtype_backend is not lib.no_default: return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend From 72675c92644784cad5d8654b6e0a5a68260de519 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:34:55 +0000 Subject: [PATCH 06/26] fix: formatting --- pandas/io/json/_json.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e9ec4d7881414..b9325c485b554 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -981,9 +981,7 @@ def _read_ujson(self) -> DataFrame | Series: elif self.nrows: lines = list(islice(self.data, self.nrows)) lines_json = self._combine_lines(lines) - obj: DataFrame | Series = self._get_object_parser( - lines_json - ) + obj: DataFrame | Series = self._get_object_parser(lines_json) else: data = ensure_str(self.data) data_lines = data.split("\n") @@ -991,9 +989,7 @@ def _read_ujson(self) -> DataFrame | Series: self._combine_lines(data_lines) ) else: - obj: DataFrame | Series = self._get_object_parser( - self.data - ) + obj: DataFrame | Series = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend From bf830f5ab70060e55a6506efd1dbda070b465cdd Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:59:46 +0000 Subject: [PATCH 07/26] fix: commit checks --- pandas/io/json/_json.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b9325c485b554..4ad6d3a3a4b4a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -975,21 +975,20 @@ def _read_ujson(self) -> DataFrame | Series: """ Read JSON using the ujson engine. 
""" + obj: DataFrame | Series if self.lines: if self.chunksize: obj = concat(self) elif self.nrows: lines = list(islice(self.data, self.nrows)) lines_json = self._combine_lines(lines) - obj: DataFrame | Series = self._get_object_parser(lines_json) + obj = self._get_object_parser(lines_json) else: data = ensure_str(self.data) data_lines = data.split("\n") - obj: DataFrame | Series = self._get_object_parser( - self._combine_lines(data_lines) - ) + obj = self._get_object_parser(self._combine_lines(data_lines)) else: - obj: DataFrame | Series = self._get_object_parser(self.data) + obj = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend From 46369f24834f5c1abfab323a22c00aa98b1bc6be Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 08:45:59 +0000 Subject: [PATCH 08/26] feat: change type conversion --- pandas/io/json/_json.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 4ad6d3a3a4b4a..055a572650eb7 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -957,19 +957,13 @@ def _read_pyarrow(self) -> DataFrame: pyarrow_json = import_optional_dependency("pyarrow.json") options = None - if isinstance(self.dtype, dict): - pa = import_optional_dependency("pyarrow") - fields = [] - for field, dtype in self.dtype.items(): - pd_dtype = pandas_dtype(dtype) - if isinstance(pd_dtype, ArrowDtype): - fields.append((field, pd_dtype.pyarrow_dtype)) - - schema = pa.schema(fields) - options = pyarrow_json.ParseOptions(explicit_schema=schema) - - pa_table = pyarrow_json.read_json(self.data, parse_options=options) - return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) + pa_table = pyarrow_json.read_json(self.data) + df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) + 
+ if self.dtype: + df = df.astype(self.dtype) + + return df def _read_ujson(self) -> DataFrame | Series: """ From 025fb30d0ec4200fb6a68eb7ad7a93e35c99f962 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 08:50:27 +0000 Subject: [PATCH 09/26] Update _json.py --- pandas/io/json/_json.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 055a572650eb7..2162309f44a9b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -955,7 +955,6 @@ def _read_pyarrow(self) -> DataFrame: Read JSON using the pyarrow engine. """ pyarrow_json = import_optional_dependency("pyarrow.json") - options = None pa_table = pyarrow_json.read_json(self.data) df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) From e1d202d1b16d2ba8713bd83d6ce9bd90757e20d3 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 09:13:45 +0000 Subject: [PATCH 10/26] Update _json.py --- pandas/io/json/_json.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 2162309f44a9b..eccb69ff71018 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -32,12 +32,10 @@ from pandas.core.dtypes.common import ( ensure_str, is_string_dtype, - pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, From 3954c842f1592bef6258050594629d023d06aaf2 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 17:29:58 +0000 Subject: [PATCH 11/26] Update _json.py --- pandas/io/json/_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index eccb69ff71018..8152cb0875ec6 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -957,7 +957,7 @@ def 
_read_pyarrow(self) -> DataFrame: pa_table = pyarrow_json.read_json(self.data) df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) - if self.dtype: + if isinstance(dict, self.dtype): df = df.astype(self.dtype) return df From 2572a3287c23e4e656374ef58ee526213405d8c1 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 18:25:52 +0000 Subject: [PATCH 12/26] Update _json.py --- pandas/io/json/_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 8152cb0875ec6..ec65b75cf6f4d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -957,7 +957,7 @@ def _read_pyarrow(self) -> DataFrame: pa_table = pyarrow_json.read_json(self.data) df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) - if isinstance(dict, self.dtype): + if isinstance(self.dtype, dict): df = df.astype(self.dtype) return df From 0d85bfe9e35bbfaa405fdac3c96540131e460ae3 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 19:52:18 +0000 Subject: [PATCH 13/26] Update pandas/tests/io/json/test_pandas.py Co-authored-by: William Ayd --- pandas/tests/io/json/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0d2ef94be2d5c..d9be640d1f2d2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2196,7 +2196,8 @@ def test_read_json_pyarrow_with_dtype(self, datapath): ) result = df.dtypes - expected = Series( + pa = pytest.importorskip("pyarrow") + expected = Series([pd.ArrowDtype(pa.int32()), pd.ArrowDtype(pa.int64())], ...) 
[ pd.ArrowDtype.construct_from_string("int32[pyarrow]"), pd.ArrowDtype.construct_from_string("int64[pyarrow]"), From 00f2085592a70837255791ee96e0ff31c70afba0 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 20:18:39 +0000 Subject: [PATCH 14/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0d2ef94be2d5c..d2811968c991b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2186,9 +2186,10 @@ def test_read_json_dtype_backend( @td.skip_if_no("pyarrow") def test_read_json_pyarrow_with_dtype(self, datapath): dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} + json = '{"a": 1, "b": 2}' df = read_json( - datapath("io", "json", "data", "line_delimited.json"), + StringIO(json), dtype=dtype, lines=True, engine="pyarrow", From 18f69c56243fda86cfb85086496c9ba682bbae07 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Tue, 4 Mar 2025 20:38:13 +0000 Subject: [PATCH 15/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 60751692db676..5acccb9245aa1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2198,7 +2198,8 @@ def test_read_json_pyarrow_with_dtype(self, datapath): result = df.dtypes pa = pytest.importorskip("pyarrow") - expected = Series([pd.ArrowDtype(pa.int32()), pd.ArrowDtype(pa.int64())], ...) 
+ expected = Series( + [pd.ArrowDtype(pa.int32()), pd.ArrowDtype(pa.int64())], [ pd.ArrowDtype.construct_from_string("int32[pyarrow]"), pd.ArrowDtype.construct_from_string("int64[pyarrow]"), From de9726699b6bd10ca5d88291c62f5257b4b7f474 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Wed, 5 Mar 2025 07:44:17 +0000 Subject: [PATCH 16/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5acccb9245aa1..6edd41aa4df4b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2184,7 +2184,7 @@ def test_read_json_dtype_backend( tm.assert_frame_equal(result, expected, check_column_type=False) @td.skip_if_no("pyarrow") - def test_read_json_pyarrow_with_dtype(self, datapath): + def test_read_json_pyarrow_with_dtype(self): dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} json = '{"a": 1, "b": 2}' From e87097f59c119dd0f864e6409f470196980bfa4b Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Wed, 5 Mar 2025 08:16:05 +0000 Subject: [PATCH 17/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6edd41aa4df4b..264339c102e3f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,6 @@ import datetime from datetime import timedelta -from io import StringIO +from io import StringIO, BytesIO import json import os import sys @@ -2186,10 +2186,10 @@ def test_read_json_dtype_backend( @td.skip_if_no("pyarrow") def test_read_json_pyarrow_with_dtype(self): dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} - json = '{"a": 1, "b": 2}' + json = b'{"a": 1, "b": 2}\n' df = 
read_json( - StringIO(json), + BytesIO(json), dtype=dtype, lines=True, engine="pyarrow", @@ -2199,8 +2199,7 @@ def test_read_json_pyarrow_with_dtype(self): result = df.dtypes pa = pytest.importorskip("pyarrow") expected = Series( - [pd.ArrowDtype(pa.int32()), pd.ArrowDtype(pa.int64())], - [ + data=[ pd.ArrowDtype.construct_from_string("int32[pyarrow]"), pd.ArrowDtype.construct_from_string("int64[pyarrow]"), ], From a855a59895f64ff88c0490680daa89edd81c42de Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Wed, 5 Mar 2025 08:25:54 +0000 Subject: [PATCH 18/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 264339c102e3f..4fb75eea63316 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,9 @@ import datetime from datetime import timedelta -from io import StringIO, BytesIO +from io import ( + BytesIO, + StringIO, +) import json import os import sys @@ -2197,7 +2200,6 @@ def test_read_json_pyarrow_with_dtype(self): ) result = df.dtypes - pa = pytest.importorskip("pyarrow") expected = Series( data=[ pd.ArrowDtype.construct_from_string("int32[pyarrow]"), From a4b7f95e4dfbdf118ae5b3a4b2699c1ca1d7d680 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Wed, 5 Mar 2025 18:50:50 +0000 Subject: [PATCH 19/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4fb75eea63316..1fb2f6a2cc8b6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2187,6 +2187,7 @@ def test_read_json_dtype_backend( tm.assert_frame_equal(result, expected, check_column_type=False) @td.skip_if_no("pyarrow") + 
@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_read_json_pyarrow_with_dtype(self): dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} json = b'{"a": 1, "b": 2}\n' From 6406840522458e74765b1126d81af4663d2c6c30 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Thu, 6 Mar 2025 07:41:31 +0000 Subject: [PATCH 20/26] Update _json.py --- pandas/io/json/_json.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ec65b75cf6f4d..6b4f6c05c3123 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -32,10 +32,12 @@ from pandas.core.dtypes.common import ( ensure_str, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( + ArrowDtype, DataFrame, Index, MultiIndex, @@ -953,12 +955,23 @@ def _read_pyarrow(self) -> DataFrame: Read JSON using the pyarrow engine. """ pyarrow_json = import_optional_dependency("pyarrow.json") - - pa_table = pyarrow_json.read_json(self.data) - df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) + options = None if isinstance(self.dtype, dict): - df = df.astype(self.dtype) + pa = import_optional_dependency("pyarrow") + fields = [] + for field, dtype in self.dtype.items(): + pd_dtype = pandas_dtype(dtype) + if isinstance(pd_dtype, ArrowDtype): + fields.append((field, pd_dtype.pyarrow_dtype)) + + schema = pa.schema(fields) + options = pyarrow_json.ParseOptions( + explicit_schema=schema, unexpected_field_behavior="infer" + ) + + pa_table = pyarrow_json.read_json(self.data, parse_options=options) + df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) return df From 4626ad743f4b4bc9c6949a051e3893ff8f6d2a84 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Fri, 7 Mar 2025 16:57:58 +0000 Subject: [PATCH 21/26] Update test_pandas.py --- 
pandas/tests/io/json/test_pandas.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1fb2f6a2cc8b6..a023f8642ed83 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2188,7 +2188,13 @@ def test_read_json_dtype_backend( @td.skip_if_no("pyarrow") @pytest.mark.filterwarnings("ignore::DeprecationWarning") - def test_read_json_pyarrow_with_dtype(self): + def test_read_json_pyarrow_with_dtype(self, request): + pa = pytest.importorskip("pyarrow") + version_tuple = tuple(map(int, pa.__version__.split('.'))) + + if version_tuple[0] < 16: + request.applymarker(pytest.mark.filterwarnings("ignore::DeprecationWarning")) + dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} json = b'{"a": 1, "b": 2}\n' From 7d7171b2acb7bfe946fb4d74efd9d39a8c20ebe2 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Fri, 7 Mar 2025 17:04:18 +0000 Subject: [PATCH 22/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a023f8642ed83..56fb4602acb57 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2190,10 +2190,12 @@ def test_read_json_dtype_backend( @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_read_json_pyarrow_with_dtype(self, request): pa = pytest.importorskip("pyarrow") - version_tuple = tuple(map(int, pa.__version__.split('.'))) + version_tuple = tuple(map(int, pa.__version__.split("."))) if version_tuple[0] < 16: - request.applymarker(pytest.mark.filterwarnings("ignore::DeprecationWarning")) + request.applymarker( + pytest.mark.filterwarnings("ignore::DeprecationWarning") + ) dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} json = b'{"a": 1, "b": 2}\n' From 
883b84b62089bf7d8d4d77cd81ccf9516260920b Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Fri, 7 Mar 2025 17:44:22 +0000 Subject: [PATCH 23/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 56fb4602acb57..b2e603dbc0aa7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -30,6 +30,7 @@ read_json, ) import pandas._testing as tm +from pandas.util.version import Version from pandas.io.json import ujson_dumps @@ -2187,12 +2188,10 @@ def test_read_json_dtype_backend( tm.assert_frame_equal(result, expected, check_column_type=False) @td.skip_if_no("pyarrow") - @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_read_json_pyarrow_with_dtype(self, request): pa = pytest.importorskip("pyarrow") - version_tuple = tuple(map(int, pa.__version__.split("."))) - if version_tuple[0] < 16: + if Version(pa.__version__) < Version("16.0"): request.applymarker( pytest.mark.filterwarnings("ignore::DeprecationWarning") ) From 80881ae28fc5675646e4ccc9b0692d54c8d278b6 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Fri, 7 Mar 2025 18:09:48 +0000 Subject: [PATCH 24/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b2e603dbc0aa7..cc14cfabd2852 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2191,7 +2191,7 @@ def test_read_json_dtype_backend( def test_read_json_pyarrow_with_dtype(self, request): pa = pytest.importorskip("pyarrow") - if Version(pa.__version__) < Version("16.0"): + if Version(pa.__version__) <= Version("16.0"): request.applymarker( 
pytest.mark.filterwarnings("ignore::DeprecationWarning") ) From 8df8914a56231f3ceff461256bc76717cfdf001f Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Wed, 19 Mar 2025 08:40:07 +0000 Subject: [PATCH 25/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index cc14cfabd2852..dd270b9c4639e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2188,14 +2188,8 @@ def test_read_json_dtype_backend( tm.assert_frame_equal(result, expected, check_column_type=False) @td.skip_if_no("pyarrow") - def test_read_json_pyarrow_with_dtype(self, request): - pa = pytest.importorskip("pyarrow") - - if Version(pa.__version__) <= Version("16.0"): - request.applymarker( - pytest.mark.filterwarnings("ignore::DeprecationWarning") - ) - + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_read_json_pyarrow_with_dtype(self): dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} json = b'{"a": 1, "b": 2}\n' From 5c581fc885e32e520876f85f907e36d5232b9cf1 Mon Sep 17 00:00:00 2001 From: william larkin <56956489+will-larkin@users.noreply.github.com> Date: Wed, 19 Mar 2025 08:43:08 +0000 Subject: [PATCH 26/26] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index dd270b9c4639e..fde9940ea78eb 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -30,7 +30,6 @@ read_json, ) import pandas._testing as tm -from pandas.util.version import Version from pandas.io.json import ujson_dumps