Skip to content

BUG: fix read_json ignoring the dtype with the pyarrow engine #60997

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
835edf6
fix: pass dtypes to read_json with pyarrow engine
will-larkin Feb 24, 2025
30d21de
Merge branch 'pandas-dev:main' into main
will-larkin Feb 24, 2025
074b3cb
fix: code checks
will-larkin Feb 24, 2025
c3f9e7f
Merge branch 'main' of https://github.com/will-larkin/pandas-will-larkin
will-larkin Feb 24, 2025
28fa332
fix: commit checks
will-larkin Feb 24, 2025
5a8158b
fix: commit checks
will-larkin Feb 24, 2025
73f18a4
fix: commit checks
will-larkin Feb 24, 2025
72675c9
fic: formatting
will-larkin Feb 24, 2025
bf830f5
fix: commit checks
will-larkin Feb 24, 2025
2828ccb
Merge branch 'main' into main
will-larkin Feb 26, 2025
46369f2
feat: change type conversion
will-larkin Mar 4, 2025
ef069f5
Merge branch 'main' of https://github.com/will-larkin/pandas-will-larkin
will-larkin Mar 4, 2025
025fb30
Update _json.py
will-larkin Mar 4, 2025
e1d202d
Update _json.py
will-larkin Mar 4, 2025
1d71fe6
Merge branch 'main' into main
will-larkin Mar 4, 2025
3954c84
Update _json.py
will-larkin Mar 4, 2025
eb6b283
Merge branch 'main' of https://github.com/will-larkin/pandas-will-larkin
will-larkin Mar 4, 2025
2572a32
Update _json.py
will-larkin Mar 4, 2025
0d85bfe
Update pandas/tests/io/json/test_pandas.py
will-larkin Mar 4, 2025
00f2085
Update test_pandas.py
will-larkin Mar 4, 2025
04ff6da
Merge branch 'main' of https://github.com/will-larkin/pandas-will-larkin
will-larkin Mar 4, 2025
18f69c5
Update test_pandas.py
will-larkin Mar 4, 2025
de97266
Update test_pandas.py
will-larkin Mar 5, 2025
abc2418
Merge remote-tracking branch 'upstream/main'
will-larkin Mar 5, 2025
e87097f
Update test_pandas.py
will-larkin Mar 5, 2025
a855a59
Update test_pandas.py
will-larkin Mar 5, 2025
a4b7f95
Update test_pandas.py
will-larkin Mar 5, 2025
6406840
Update _json.py
will-larkin Mar 6, 2025
4626ad7
Update test_pandas.py
will-larkin Mar 7, 2025
7d7171b
Update test_pandas.py
will-larkin Mar 7, 2025
883b84b
Update test_pandas.py
will-larkin Mar 7, 2025
80881ae
Update test_pandas.py
will-larkin Mar 7, 2025
8df8914
Update test_pandas.py
will-larkin Mar 19, 2025
5c581fc
Update test_pandas.py
will-larkin Mar 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,7 @@ I/O
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
Expand Down
65 changes: 43 additions & 22 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,29 +942,50 @@ def read(self) -> DataFrame | Series:
obj: DataFrame | Series
with self:
if self.engine == "pyarrow":
pyarrow_json = import_optional_dependency("pyarrow.json")
pa_table = pyarrow_json.read_json(self.data)
return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
obj = self._read_pyarrow()
elif self.engine == "ujson":
if self.lines:
if self.chunksize:
obj = concat(self)
elif self.nrows:
lines = list(islice(self.data, self.nrows))
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
else:
data = ensure_str(self.data)
data_lines = data.split("\n")
obj = self._get_object_parser(self._combine_lines(data_lines))
else:
obj = self._get_object_parser(self.data)
if self.dtype_backend is not lib.no_default:
return obj.convert_dtypes(
infer_objects=False, dtype_backend=self.dtype_backend
)
else:
return obj
obj = self._read_ujson()

return obj

def _read_pyarrow(self) -> DataFrame:
    """
    Read JSON using the pyarrow engine.

    ``self.data`` is parsed with ``pyarrow.json.read_json`` and the resulting
    table is converted with ``arrow_table_to_pandas`` using
    ``self.dtype_backend``. When ``self.dtype`` is a dict, the entries whose
    keys are columns of the parsed frame are then applied with
    ``DataFrame.astype``; any other value of ``self.dtype`` is left unused
    by this method.

    Returns
    -------
    DataFrame
        The parsed data, with the requested per-column dtypes applied.
    """
    pyarrow_json = import_optional_dependency("pyarrow.json")

    pa_table = pyarrow_json.read_json(self.data)
    df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)

    if isinstance(self.dtype, dict):
        # DataFrame.astype raises KeyError for mapping keys that are not
        # column labels, so only cast the columns that were actually parsed.
        present = {col: dt for col, dt in self.dtype.items() if col in df.columns}
        if present:
            df = df.astype(present)

    return df

def _read_ujson(self) -> DataFrame | Series:
    """
    Read JSON using the ujson engine.

    Without ``self.lines`` the raw ``self.data`` is handed straight to
    ``_get_object_parser``. With ``self.lines`` set, one of three paths is
    taken, checked in this order:

    * ``self.chunksize`` is truthy: the reader itself is passed to ``concat``.
    * ``self.nrows`` is truthy: the first ``nrows`` items of ``self.data``
      are combined with ``_combine_lines`` and parsed.
    * otherwise: ``self.data`` is converted with ``ensure_str``, split on
      ``"\\n"``, combined with ``_combine_lines`` and parsed.

    Returns
    -------
    DataFrame or Series
        The parsed object, passed through ``convert_dtypes`` when
        ``self.dtype_backend`` is not ``lib.no_default``.
    """
    result: DataFrame | Series
    if not self.lines:
        result = self._get_object_parser(self.data)
    elif self.chunksize:
        result = concat(self)
    elif self.nrows:
        head = list(islice(self.data, self.nrows))
        result = self._get_object_parser(self._combine_lines(head))
    else:
        text = ensure_str(self.data)
        result = self._get_object_parser(self._combine_lines(text.split("\n")))

    # No explicit backend requested: hand back the parsed object untouched.
    if self.dtype_backend is lib.no_default:
        return result
    return result.convert_dtypes(
        infer_objects=False, dtype_backend=self.dtype_backend
    )

def _get_object_parser(self, json: str) -> DataFrame | Series:
"""
Expand Down
29 changes: 28 additions & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import datetime
from datetime import timedelta
from io import StringIO
from io import (
BytesIO,
StringIO,
)
import json
import os
import sys
Expand Down Expand Up @@ -2183,6 +2186,30 @@ def test_read_json_dtype_backend(
# string_storage setting -> ignore that for checking the result
tm.assert_frame_equal(result, expected, check_column_type=False)

@td.skip_if_no("pyarrow")
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
Copy link
Member

@mroeschke mroeschke Mar 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add this filterwarnings mark only if pyarrow < 16?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find a way to do that with pytest filterwarnings

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

def test_foo(..., request):
     if pa_version_under16p0:
         request.applymarker(pytest.mark.filterwarnings(...))

def test_read_json_pyarrow_with_dtype(self):
    # GH#59516: an explicit dtype mapping must be applied with engine="pyarrow"
    requested = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"}
    # Named ``payload`` rather than ``json`` so the imported module is not shadowed.
    payload = b'{"a": 1, "b": 2}\n'

    frame = read_json(
        BytesIO(payload),
        dtype=requested,
        lines=True,
        engine="pyarrow",
        dtype_backend="pyarrow",
    )

    # One ArrowDtype per requested column, in the order the columns were requested.
    arrow_dtypes = [
        pd.ArrowDtype.construct_from_string(name) for name in requested.values()
    ]
    expected = Series(data=arrow_dtypes, index=list(requested))
    tm.assert_series_equal(frame.dtypes, expected)

@pytest.mark.parametrize("orient", ["split", "records", "index"])
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
# GH#50750
Expand Down
Loading