Skip to content

Commit 583026b

Browse files
authored
ENH: Add support for reading value labels from 108-format and prior Stata dta files (#58155)
* ENH: Add support for reading value labels from 108-format and prior Stata dta files
* Add type hints for value label dictionary
* Apply changes suggested by pylint
* Clarify that only the 108 format has both 8 character (plus null terminator) label names and uses the newer value label layout
* Split function for reading value labels into newer and older format versions
* Remove duplicate line
* Update type hints for value label dictionary keys to match read content
* Indicate versions each value label helper function applies to via docstrings
* Seek to value table location within version specific helper functions
* Wait until value labels are read before setting flag
* Move value label dictionary initialisation to class __init__
1 parent 1e9053d commit 583026b

File tree

6 files changed

+95
-20
lines changed

6 files changed

+95
-20
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ Other enhancements
3333
- :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
3434
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
3535
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
36+
- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
3637
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
3738
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
3839
-

pandas/io/stata.py

+47-19
Original file line number | Diff line number | Diff line change
@@ -1122,6 +1122,7 @@ def __init__(
11221122
# State variables for the file
11231123
self._close_file: Callable[[], None] | None = None
11241124
self._column_selector_set = False
1125+
self._value_label_dict: dict[str, dict[int, str]] = {}
11251126
self._value_labels_read = False
11261127
self._dtype: np.dtype | None = None
11271128
self._lines_read = 0
@@ -1502,36 +1503,26 @@ def _decode(self, s: bytes) -> str:
15021503
)
15031504
return s.decode("latin-1")
15041505

1505-
def _read_value_labels(self) -> None:
1506-
self._ensure_open()
1507-
if self._value_labels_read:
1508-
# Don't read twice
1509-
return
1510-
if self._format_version <= 108:
1511-
# Value labels are not supported in version 108 and earlier.
1512-
self._value_labels_read = True
1513-
self._value_label_dict: dict[str, dict[float, str]] = {}
1514-
return
1515-
1506+
def _read_new_value_labels(self) -> None:
1507+
"""Reads value labels with variable length strings (108 and later format)"""
15161508
if self._format_version >= 117:
15171509
self._path_or_buf.seek(self._seek_value_labels)
15181510
else:
15191511
assert self._dtype is not None
15201512
offset = self._nobs * self._dtype.itemsize
15211513
self._path_or_buf.seek(self._data_location + offset)
15221514

1523-
self._value_labels_read = True
1524-
self._value_label_dict = {}
1525-
15261515
while True:
15271516
if self._format_version >= 117:
15281517
if self._path_or_buf.read(5) == b"</val": # <lbl>
15291518
break # end of value label table
15301519

15311520
slength = self._path_or_buf.read(4)
15321521
if not slength:
1533-
break # end of value label table (format < 117)
1534-
if self._format_version <= 117:
1522+
break # end of value label table (format < 117), or end-of-file
1523+
if self._format_version == 108:
1524+
labname = self._decode(self._path_or_buf.read(9))
1525+
elif self._format_version <= 117:
15351526
labname = self._decode(self._path_or_buf.read(33))
15361527
else:
15371528
labname = self._decode(self._path_or_buf.read(129))
@@ -1555,8 +1546,45 @@ def _read_value_labels(self) -> None:
15551546
self._value_label_dict[labname][val[i]] = self._decode(
15561547
txt[off[i] : end]
15571548
)
1549+
15581550
if self._format_version >= 117:
15591551
self._path_or_buf.read(6) # </lbl>
1552+
1553+
def _read_old_value_labels(self) -> None:
1554+
"""Reads value labels with fixed-length strings (105 and earlier format)"""
1555+
assert self._dtype is not None
1556+
offset = self._nobs * self._dtype.itemsize
1557+
self._path_or_buf.seek(self._data_location + offset)
1558+
1559+
while True:
1560+
if not self._path_or_buf.read(2):
1561+
# end-of-file may have been reached, if so stop here
1562+
break
1563+
1564+
# otherwise back up and read again, taking byteorder into account
1565+
self._path_or_buf.seek(-2, os.SEEK_CUR)
1566+
n = self._read_uint16()
1567+
labname = self._decode(self._path_or_buf.read(9))
1568+
self._path_or_buf.read(1) # padding
1569+
codes = np.frombuffer(
1570+
self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n
1571+
)
1572+
self._value_label_dict[labname] = {}
1573+
for i in range(n):
1574+
self._value_label_dict[labname][codes[i]] = self._decode(
1575+
self._path_or_buf.read(8)
1576+
)
1577+
1578+
def _read_value_labels(self) -> None:
1579+
self._ensure_open()
1580+
if self._value_labels_read:
1581+
# Don't read twice
1582+
return
1583+
1584+
if self._format_version >= 108:
1585+
self._read_new_value_labels()
1586+
else:
1587+
self._read_old_value_labels()
15601588
self._value_labels_read = True
15611589

15621590
def _read_strls(self) -> None:
@@ -1729,7 +1757,7 @@ def read(
17291757
i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
17301758
)
17311759

1732-
if convert_categoricals and self._format_version > 108:
1760+
if convert_categoricals:
17331761
data = self._do_convert_categoricals(
17341762
data, self._value_label_dict, self._lbllist, order_categoricals
17351763
)
@@ -1845,7 +1873,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
18451873
def _do_convert_categoricals(
18461874
self,
18471875
data: DataFrame,
1848-
value_label_dict: dict[str, dict[float, str]],
1876+
value_label_dict: dict[str, dict[int, str]],
18491877
lbllist: Sequence[str],
18501878
order_categoricals: bool,
18511879
) -> DataFrame:
@@ -1983,7 +2011,7 @@ def variable_labels(self) -> dict[str, str]:
19832011
self._ensure_open()
19842012
return dict(zip(self._varlist, self._variable_labels))
19852013

1986-
def value_labels(self) -> dict[str, dict[float, str]]:
2014+
def value_labels(self) -> dict[str, dict[int, str]]:
19872015
"""
19882016
Return a nested dict associating each variable name to its value and label.
19892017
816 Bytes
Binary file not shown.
1.19 KB
Binary file not shown.
1.49 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+47-1
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath):
225225
tm.assert_frame_equal(parsed, expected)
226226

227227
@pytest.mark.parametrize(
228-
"file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"]
228+
"file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
229229
)
230230
def test_read_dta4(self, file, datapath):
231231
file = datapath("io", "data", "stata", f"{file}.dta")
@@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath):
270270
# stata doesn't save .category metadata
271271
tm.assert_frame_equal(parsed, expected)
272272

273+
@pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
274+
def test_readold_dta4(self, file, datapath):
275+
# This test is the same as test_read_dta4 above except that the columns
276+
# had to be renamed to match the restrictions in older file format
277+
file = datapath("io", "data", "stata", f"{file}.dta")
278+
parsed = self.read_dta(file)
279+
280+
expected = DataFrame.from_records(
281+
[
282+
["one", "ten", "one", "one", "one"],
283+
["two", "nine", "two", "two", "two"],
284+
["three", "eight", "three", "three", "three"],
285+
["four", "seven", 4, "four", "four"],
286+
["five", "six", 5, np.nan, "five"],
287+
["six", "five", 6, np.nan, "six"],
288+
["seven", "four", 7, np.nan, "seven"],
289+
["eight", "three", 8, np.nan, "eight"],
290+
["nine", "two", 9, np.nan, "nine"],
291+
["ten", "one", "ten", np.nan, "ten"],
292+
],
293+
columns=[
294+
"fulllab",
295+
"fulllab2",
296+
"incmplab",
297+
"misslab",
298+
"floatlab",
299+
],
300+
)
301+
302+
# these are all categoricals
303+
for col in expected:
304+
orig = expected[col].copy()
305+
306+
categories = np.asarray(expected["fulllab"][orig.notna()])
307+
if col == "incmplab":
308+
categories = orig
309+
310+
cat = orig.astype("category")._values
311+
cat = cat.set_categories(categories, ordered=True)
312+
cat.categories.rename(None, inplace=True)
313+
314+
expected[col] = cat
315+
316+
# stata doesn't save .category metadata
317+
tm.assert_frame_equal(parsed, expected)
318+
273319
# File containing strls
274320
def test_read_dta12(self, datapath):
275321
parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))

0 commit comments

Comments
 (0)