Skip to content

Commit 755a99b

Browse files
authored
ENH: Add use_nullable_dtypes to read_spss (#51115)
1 parent 1acdf4b commit 755a99b

File tree

3 files changed

+52
-0
lines changed

3 files changed

+52
-0
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
123123
* :func:`read_sql_table`
124124
* :func:`read_orc`
125125
* :func:`read_feather`
126+
* :func:`read_spss`
126127
* :func:`to_numeric`
127128

128129
To simplify opting-in to nullable dtypes for these functions, a new option ``nullable_dtypes`` was added that allows setting
@@ -151,6 +152,7 @@ to select the nullable dtypes implementation.
151152
* :func:`read_parquet`
152153
* :func:`read_orc`
153154
* :func:`read_feather`
155+
* :func:`read_spss`
154156
* :func:`to_numeric`
155157

156158

pandas/io/spss.py

+26
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
Sequence,
77
)
88

9+
from pandas._config import using_nullable_dtypes
10+
11+
from pandas._libs import lib
912
from pandas.compat._optional import import_optional_dependency
1013

1114
from pandas.core.dtypes.inference import is_list_like
@@ -20,6 +23,7 @@ def read_spss(
2023
path: str | Path,
2124
usecols: Sequence[str] | None = None,
2225
convert_categoricals: bool = True,
26+
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
2327
) -> DataFrame:
2428
"""
2529
Load an SPSS file from the file path, returning a DataFrame.
@@ -32,13 +36,33 @@ def read_spss(
3236
Return a subset of the columns. If None, return all columns.
3337
convert_categoricals : bool, default is True
3438
Convert categorical columns into pd.Categorical.
39+
use_nullable_dtypes : bool = False
40+
Whether to use nullable dtypes as default when reading data. If
41+
set to True, nullable dtypes are used for all dtypes that have a nullable
42+
implementation, even if no nulls are present.
43+
44+
.. note::
45+
46+
The nullable dtype implementation can be configured by calling
47+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
48+
numpy-backed nullable dtypes or
49+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
50+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
51+
52+
.. versionadded:: 2.0
3553
3654
Returns
3755
-------
3856
DataFrame
3957
"""
4058
pyreadstat = import_optional_dependency("pyreadstat")
4159

60+
use_nullable_dtypes = (
61+
use_nullable_dtypes
62+
if use_nullable_dtypes is not lib.no_default
63+
else using_nullable_dtypes()
64+
)
65+
4266
if usecols is not None:
4367
if not is_list_like(usecols):
4468
raise TypeError("usecols must be list-like.")
@@ -47,4 +71,6 @@ def read_spss(
4771
df, _ = pyreadstat.read_sav(
4872
stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
4973
)
74+
if use_nullable_dtypes:
75+
df = df.convert_dtypes()
5076
return df

pandas/tests/io/test_spss.py

+24
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,27 @@ def test_spss_usecols(datapath):
8080

8181
with pytest.raises(TypeError, match="usecols must be list-like."):
8282
pd.read_spss(fname, usecols="VAR00002")
83+
84+
85+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
def test_spss_umlauts_use_nullable_dtypes(datapath, dtype_backend):
    """
    Check that ``read_spss(..., use_nullable_dtypes=True)`` returns nullable
    dtypes for each value of the ``mode.dtype_backend`` option.
    """
    # test file from the Haven project (https://haven.tidyverse.org/)
    fname = datapath("io", "data", "spss", "umlauts.sav")

    pa = None
    if dtype_backend == "pyarrow":
        # Skip *before* reading: with the pyarrow backend selected the read
        # itself needs pyarrow, so checking afterwards would turn a missing
        # optional dependency into an error instead of a skip.
        pa = pytest.importorskip("pyarrow")

    with pd.option_context("mode.dtype_backend", dtype_backend):
        df = pd.read_spss(fname, convert_categoricals=False, use_nullable_dtypes=True)
    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}, dtype="Int64")

    if dtype_backend == "pyarrow":
        from pandas.arrays import ArrowExtensionArray

        # Rebuild the expected frame column by column as pyarrow-backed arrays.
        expected = pd.DataFrame(
            {
                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
                for col in expected.columns
            }
        )

    tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)