ENH: Add use_nullable_dtypes to read_spss (#51115)

phofl · web-flow · commit 755a99b258ee · 2023-02-02T10:25:06.000-08:00
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -123,6 +123,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql_table`
 * :func:`read_orc`
 * :func:`read_feather`
+* :func:`read_spss`
 * :func:`to_numeric`
 
 To simplify opting-in to nullable dtypes for these functions, a new option ``nullable_dtypes`` was added that allows setting
@@ -151,6 +152,7 @@ to select the nullable dtypes implementation.
 * :func:`read_parquet`
 * :func:`read_orc`
 * :func:`read_feather`
+* :func:`read_spss`
 * :func:`to_numeric`
 
 
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
@@ -6,6 +6,9 @@
     Sequence,
 )
 
+from pandas._config import using_nullable_dtypes
+
+from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.inference import is_list_like
@@ -20,6 +23,7 @@ def read_spss(
     path: str | Path,
     usecols: Sequence[str] | None = None,
     convert_categoricals: bool = True,
+    use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
 ) -> DataFrame:
     """
     Load an SPSS file from the file path, returning a DataFrame.
@@ -32,13 +36,33 @@ def read_spss(
         Return a subset of the columns. If None, return all columns.
     convert_categoricals : bool, default is True
         Convert categorical columns into pd.Categorical.
+    use_nullable_dtypes : bool, default False
+        Whether to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        .. note::
+
+            The nullable dtype implementation can be configured by calling
+            ``pd.set_option("mode.dtype_backend", "pandas")`` to use
+            numpy-backed nullable dtypes or
+            ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
+            pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0
 
     Returns
     -------
     DataFrame
     """
     pyreadstat = import_optional_dependency("pyreadstat")
 
+    use_nullable_dtypes = (
+        use_nullable_dtypes
+        if use_nullable_dtypes is not lib.no_default
+        else using_nullable_dtypes()
+    )
+
     if usecols is not None:
         if not is_list_like(usecols):
             raise TypeError("usecols must be list-like.")
@@ -47,4 +71,6 @@ def read_spss(
     df, _ = pyreadstat.read_sav(
         stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
     )
+    if use_nullable_dtypes:
+        df = df.convert_dtypes()
     return df
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
@@ -80,3 +80,27 @@ def test_spss_usecols(datapath):
 
     with pytest.raises(TypeError, match="usecols must be list-like."):
         pd.read_spss(fname, usecols="VAR00002")
+
+
+@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+def test_spss_umlauts_use_nullable_dtypes(datapath, dtype_backend):
+    # Skip before the read; the pyarrow backend presumably requires pyarrow.
+    pa = pytest.importorskip("pyarrow") if dtype_backend == "pyarrow" else None
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "spss", "umlauts.sav")
+
+    with pd.option_context("mode.dtype_backend", dtype_backend):
+        df = pd.read_spss(fname, convert_categoricals=False, use_nullable_dtypes=True)
+    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}, dtype="Int64")
+
+    if pa is not None:
+        from pandas.arrays import ArrowExtensionArray
+
+        expected = pd.DataFrame(
+            {
+                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                for col in expected.columns
+            }
+        )
+
+    tm.assert_frame_equal(df, expected)