Skip to content

Commit 7f8baa0

Browse files
authored
ENH: Add use_nullable_dtypes option to read_json (#50750)
* ENH: Add use_nullable_dtypes option to read_json (#50750)

* Add gh ref

* Add test
1 parent ef4cf72 commit 7f8baa0

File tree

3 files changed

+129
-3
lines changed

3 files changed

+129
-3
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
4141
* :func:`read_excel`
4242
* :func:`read_html`
4343
* :func:`read_xml`
44+
* :func:`read_json`
4445
* :func:`read_sql`
4546
* :func:`read_sql_query`
4647
* :func:`read_sql_table`
@@ -56,6 +57,7 @@ to select the nullable dtypes implementation.
5657
* :func:`read_excel`
5758
* :func:`read_html`
5859
* :func:`read_xml`
60+
* :func:`read_json`
5961
* :func:`read_parquet`
6062
* :func:`read_orc`
6163
* :func:`read_feather`

pandas/io/json/_json.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
ensure_str,
4444
is_period_dtype,
4545
)
46+
from pandas.core.dtypes.generic import ABCIndex
4647

4748
from pandas import (
4849
DataFrame,
@@ -396,6 +397,7 @@ def read_json(
396397
compression: CompressionOptions = ...,
397398
nrows: int | None = ...,
398399
storage_options: StorageOptions = ...,
400+
use_nullable_dtypes: bool = ...,
399401
) -> JsonReader[Literal["frame"]]:
400402
...
401403

@@ -419,6 +421,7 @@ def read_json(
419421
compression: CompressionOptions = ...,
420422
nrows: int | None = ...,
421423
storage_options: StorageOptions = ...,
424+
use_nullable_dtypes: bool = ...,
422425
) -> JsonReader[Literal["series"]]:
423426
...
424427

@@ -442,6 +445,7 @@ def read_json(
442445
compression: CompressionOptions = ...,
443446
nrows: int | None = ...,
444447
storage_options: StorageOptions = ...,
448+
use_nullable_dtypes: bool = ...,
445449
) -> Series:
446450
...
447451

@@ -465,6 +469,7 @@ def read_json(
465469
compression: CompressionOptions = ...,
466470
nrows: int | None = ...,
467471
storage_options: StorageOptions = ...,
472+
use_nullable_dtypes: bool = ...,
468473
) -> DataFrame:
469474
...
470475

@@ -491,6 +496,7 @@ def read_json(
491496
compression: CompressionOptions = "infer",
492497
nrows: int | None = None,
493498
storage_options: StorageOptions = None,
499+
use_nullable_dtypes: bool = False,
494500
) -> DataFrame | Series | JsonReader:
495501
"""
496502
Convert a JSON string to pandas object.
@@ -629,6 +635,19 @@ def read_json(
629635
630636
.. versionadded:: 1.2.0
631637
638+
use_nullable_dtypes : bool, default False
639+
Whether or not to use nullable dtypes as default when reading data. If
640+
set to True, nullable dtypes are used for all dtypes that have a nullable
641+
implementation, even if no nulls are present.
642+
643+
The nullable dtype implementation can be configured by calling
644+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
645+
numpy-backed nullable dtypes or
646+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
647+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
648+
649+
.. versionadded:: 2.0
650+
632651
Returns
633652
-------
634653
Series or DataFrame
@@ -740,6 +759,7 @@ def read_json(
740759
nrows=nrows,
741760
storage_options=storage_options,
742761
encoding_errors=encoding_errors,
762+
use_nullable_dtypes=use_nullable_dtypes,
743763
)
744764

745765
if chunksize:
@@ -775,6 +795,7 @@ def __init__(
775795
nrows: int | None,
776796
storage_options: StorageOptions = None,
777797
encoding_errors: str | None = "strict",
798+
use_nullable_dtypes: bool = False,
778799
) -> None:
779800

780801
self.orient = orient
@@ -794,6 +815,7 @@ def __init__(
794815
self.nrows = nrows
795816
self.encoding_errors = encoding_errors
796817
self.handles: IOHandles[str] | None = None
818+
self.use_nullable_dtypes = use_nullable_dtypes
797819

798820
if self.chunksize is not None:
799821
self.chunksize = validate_integer("chunksize", self.chunksize, 1)
@@ -903,7 +925,10 @@ def read(self) -> DataFrame | Series:
903925
obj = self._get_object_parser(self._combine_lines(data_lines))
904926
else:
905927
obj = self._get_object_parser(self.data)
906-
return obj
928+
if self.use_nullable_dtypes:
929+
return obj.convert_dtypes(infer_objects=False)
930+
else:
931+
return obj
907932

908933
def _get_object_parser(self, json) -> DataFrame | Series:
909934
"""
@@ -919,6 +944,7 @@ def _get_object_parser(self, json) -> DataFrame | Series:
919944
"keep_default_dates": self.keep_default_dates,
920945
"precise_float": self.precise_float,
921946
"date_unit": self.date_unit,
947+
"use_nullable_dtypes": self.use_nullable_dtypes,
922948
}
923949
obj = None
924950
if typ == "frame":
@@ -977,7 +1003,10 @@ def __next__(self) -> DataFrame | Series:
9771003
self.close()
9781004
raise ex
9791005

980-
return obj
1006+
if self.use_nullable_dtypes:
1007+
return obj.convert_dtypes(infer_objects=False)
1008+
else:
1009+
return obj
9811010

9821011
def __enter__(self) -> JsonReader[FrameSeriesStrT]:
9831012
return self
@@ -1013,6 +1042,7 @@ def __init__(
10131042
keep_default_dates: bool = False,
10141043
precise_float: bool = False,
10151044
date_unit=None,
1045+
use_nullable_dtypes: bool = False,
10161046
) -> None:
10171047
self.json = json
10181048

@@ -1037,6 +1067,7 @@ def __init__(
10371067
self.date_unit = date_unit
10381068
self.keep_default_dates = keep_default_dates
10391069
self.obj: DataFrame | Series | None = None
1070+
self.use_nullable_dtypes = use_nullable_dtypes
10401071

10411072
def check_keys_split(self, decoded) -> None:
10421073
"""
@@ -1119,7 +1150,10 @@ def _try_convert_data(
11191150
if result:
11201151
return new_data, True
11211152

1122-
if data.dtype == "object":
1153+
if self.use_nullable_dtypes and not isinstance(data, ABCIndex):
1154+
# Fall through for conversion later on
1155+
return data, True
1156+
elif data.dtype == "object":
11231157

11241158
# try float
11251159
try:

pandas/tests/io/json/test_pandas.py

+90
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,18 @@
1515

1616
import pandas as pd
1717
from pandas import (
18+
NA,
1819
DataFrame,
1920
DatetimeIndex,
2021
Series,
2122
Timestamp,
2223
read_json,
2324
)
2425
import pandas._testing as tm
26+
from pandas.core.arrays import (
27+
ArrowStringArray,
28+
StringArray,
29+
)
2530

2631

2732
def assert_json_roundtrip_equal(result, expected, orient):
@@ -1863,3 +1868,88 @@ def test_json_uint64(self):
18631868
df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]})
18641869
result = df.to_json(orient="split")
18651870
assert result == expected
1871+
1872+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
1873+
@pytest.mark.parametrize(
1874+
"orient", ["split", "records", "values", "index", "columns"]
1875+
)
1876+
def test_read_json_nullable(self, string_storage, dtype_backend, orient):
1877+
# GH#50750
1878+
pa = pytest.importorskip("pyarrow")
1879+
df = DataFrame(
1880+
{
1881+
"a": Series([1, np.nan, 3], dtype="Int64"),
1882+
"b": Series([1, 2, 3], dtype="Int64"),
1883+
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
1884+
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
1885+
"e": [True, False, None],
1886+
"f": [True, False, True],
1887+
"g": ["a", "b", "c"],
1888+
"h": ["a", "b", None],
1889+
}
1890+
)
1891+
1892+
if string_storage == "python":
1893+
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
1894+
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
1895+
1896+
else:
1897+
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
1898+
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
1899+
1900+
out = df.to_json(orient=orient)
1901+
with pd.option_context("mode.string_storage", string_storage):
1902+
with pd.option_context("mode.dtype_backend", dtype_backend):
1903+
result = read_json(out, use_nullable_dtypes=True, orient=orient)
1904+
1905+
expected = DataFrame(
1906+
{
1907+
"a": Series([1, np.nan, 3], dtype="Int64"),
1908+
"b": Series([1, 2, 3], dtype="Int64"),
1909+
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
1910+
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
1911+
"e": Series([True, False, NA], dtype="boolean"),
1912+
"f": Series([True, False, True], dtype="boolean"),
1913+
"g": string_array,
1914+
"h": string_array_na,
1915+
}
1916+
)
1917+
1918+
if dtype_backend == "pyarrow":
1919+
1920+
from pandas.arrays import ArrowExtensionArray
1921+
1922+
expected = DataFrame(
1923+
{
1924+
col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
1925+
for col in expected.columns
1926+
}
1927+
)
1928+
1929+
if orient == "values":
1930+
expected.columns = list(range(0, 8))
1931+
1932+
tm.assert_frame_equal(result, expected)
1933+
1934+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
1935+
@pytest.mark.parametrize("orient", ["split", "records", "index"])
1936+
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
1937+
# GH#50750
1938+
pa = pytest.importorskip("pyarrow")
1939+
ser = Series([1, np.nan, 3], dtype="Int64")
1940+
1941+
out = ser.to_json(orient=orient)
1942+
with pd.option_context("mode.string_storage", string_storage):
1943+
with pd.option_context("mode.dtype_backend", dtype_backend):
1944+
result = read_json(
1945+
out, use_nullable_dtypes=True, orient=orient, typ="series"
1946+
)
1947+
1948+
expected = Series([1, np.nan, 3], dtype="Int64")
1949+
1950+
if dtype_backend == "pyarrow":
1951+
from pandas.arrays import ArrowExtensionArray
1952+
1953+
expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))
1954+
1955+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)