Skip to content

Commit b284101

Browse files
authored
ENH: read_spss stores the metadata in df.attrs (#55472)
* ENH: read_spss stores the metadata in df.attrs * filter warning * Make separate variable
1 parent 66a54a3 commit b284101

File tree

3 files changed

+35
-1
lines changed

3 files changed

+35
-1
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other enhancements
7676

7777
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
7878
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
79+
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
7980
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
8081
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
8182
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)

pandas/io/spss.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,10 @@ def read_spss(
6363
raise TypeError("usecols must be list-like.")
6464
usecols = list(usecols) # pyreadstat requires a list
6565

66-
df, _ = pyreadstat.read_sav(
66+
df, metadata = pyreadstat.read_sav(
6767
stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
6868
)
69+
df.attrs = metadata.__dict__
6970
if dtype_backend is not lib.no_default:
7071
df = df.convert_dtypes(dtype_backend=dtype_backend)
7172
return df

pandas/tests/io/test_spss.py

+32
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,35 @@ def test_invalid_dtype_backend():
116116
)
117117
with pytest.raises(ValueError, match=msg):
118118
pd.read_spss("test", dtype_backend="numpy")
119+
120+
121+
@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
def test_spss_metadata(datapath):
    # GH 54264: read_spss should expose pyreadstat's file metadata via df.attrs.
    path = datapath("io", "data", "spss", "labelled-num.sav")
    frame = pd.read_spss(path)

    # Expected metadata for the labelled-num.sav fixture: a single numeric
    # column "VAR00002" with one labelled value (1.0 -> "This is one").
    expected = {
        "column_names": ["VAR00002"],
        "column_labels": [None],
        "column_names_to_labels": {"VAR00002": None},
        "file_encoding": "UTF-8",
        "number_columns": 1,
        "number_rows": 1,
        "variable_value_labels": {"VAR00002": {1.0: "This is one"}},
        "value_labels": {"labels0": {1.0: "This is one"}},
        "variable_to_label": {"VAR00002": "labels0"},
        "notes": [],
        "original_variable_types": {"VAR00002": "F8.0"},
        "readstat_variable_types": {"VAR00002": "double"},
        "table_name": None,
        "missing_ranges": {},
        "missing_user_values": {},
        "variable_storage_width": {"VAR00002": 8},
        "variable_display_width": {"VAR00002": 8},
        "variable_alignment": {"VAR00002": "unknown"},
        "variable_measure": {"VAR00002": "unknown"},
        "file_label": None,
        "file_format": "sav/zsav",
    }
    assert frame.attrs == expected

0 commit comments

Comments
 (0)