Skip to content

Commit b284101

Browse files
authored
ENH: read_spss stores the metadata in df.attrs (#55472)
* ENH: read_spss stores the metadata in df.attrs * filter warning * Make separate variable
1 parent 66a54a3 commit b284101

File tree

3 files changed

+35
-1
lines changed

3 files changed

+35
-1
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other enhancements
7676

7777
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
7878
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
79+
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
7980
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
8081
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
8182
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)

pandas/io/spss.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,10 @@ def read_spss(
6363
raise TypeError("usecols must be list-like.")
6464
usecols = list(usecols) # pyreadstat requires a list
6565

66-
df, _ = pyreadstat.read_sav(
66+
df, metadata = pyreadstat.read_sav(
6767
stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
6868
)
69+
df.attrs = metadata.__dict__
6970
if dtype_backend is not lib.no_default:
7071
df = df.convert_dtypes(dtype_backend=dtype_backend)
7172
return df

pandas/tests/io/test_spss.py

+32
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,35 @@ def test_invalid_dtype_backend():
116116
)
117117
with pytest.raises(ValueError, match=msg):
118118
pd.read_spss("test", dtype_backend="numpy")
119+
120+
121+
@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
def test_spss_metadata(datapath):
    # GH 54264: read_spss should expose pyreadstat's file metadata via df.attrs.
    path = datapath("io", "data", "spss", "labelled-num.sav")
    frame = pd.read_spss(path)

    # Expected metadata for the labelled-num.sav fixture: a single numeric
    # column "VAR00002" with one labelled value (1.0 -> "This is one").
    expected = {
        "column_names": ["VAR00002"],
        "column_labels": [None],
        "column_names_to_labels": {"VAR00002": None},
        "file_encoding": "UTF-8",
        "number_columns": 1,
        "number_rows": 1,
        "variable_value_labels": {"VAR00002": {1.0: "This is one"}},
        "value_labels": {"labels0": {1.0: "This is one"}},
        "variable_to_label": {"VAR00002": "labels0"},
        "notes": [],
        "original_variable_types": {"VAR00002": "F8.0"},
        "readstat_variable_types": {"VAR00002": "double"},
        "table_name": None,
        "missing_ranges": {},
        "missing_user_values": {},
        "variable_storage_width": {"VAR00002": 8},
        "variable_display_width": {"VAR00002": 8},
        "variable_alignment": {"VAR00002": "unknown"},
        "variable_measure": {"VAR00002": "unknown"},
        "file_label": None,
        "file_format": "sav/zsav",
    }
    assert frame.attrs == expected

0 commit comments

Comments
 (0)