Skip to content

ENH: Add support for reading value labels from 108-format and prior Stata dta files #58155

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Other enhancements
- :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
-
Expand Down
67 changes: 48 additions & 19 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1502,36 +1502,26 @@ def _decode(self, s: bytes) -> str:
)
return s.decode("latin-1")

def _read_value_labels(self) -> None:
self._ensure_open()
if self._value_labels_read:
# Don't read twice
return
if self._format_version <= 108:
# Value labels are not supported in version 108 and earlier.
self._value_labels_read = True
self._value_label_dict: dict[str, dict[float, str]] = {}
return

def _read_new_value_labels(self) -> None:
"""Reads value labels with variable length strings (108 and later format)"""
if self._format_version >= 117:
self._path_or_buf.seek(self._seek_value_labels)
else:
assert self._dtype is not None
offset = self._nobs * self._dtype.itemsize
self._path_or_buf.seek(self._data_location + offset)

self._value_labels_read = True
self._value_label_dict = {}

while True:
if self._format_version >= 117:
if self._path_or_buf.read(5) == b"</val": # <lbl>
break # end of value label table

slength = self._path_or_buf.read(4)
if not slength:
break # end of value label table (format < 117)
if self._format_version <= 117:
break # end of value label table (format < 117), or end-of-file
if self._format_version == 108:
labname = self._decode(self._path_or_buf.read(9))
elif self._format_version <= 117:
labname = self._decode(self._path_or_buf.read(33))
else:
labname = self._decode(self._path_or_buf.read(129))
Expand All @@ -1555,8 +1545,47 @@ def _read_value_labels(self) -> None:
self._value_label_dict[labname][val[i]] = self._decode(
txt[off[i] : end]
)

if self._format_version >= 117:
self._path_or_buf.read(6) # </lbl>

def _read_old_value_labels(self) -> None:
"""Reads value labels with fixed-length strings (105 and earlier format)"""
assert self._dtype is not None
offset = self._nobs * self._dtype.itemsize
self._path_or_buf.seek(self._data_location + offset)

while True:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A short one-line docstring here and on the method above, stating which format versions each one targets, would help readability without the reader having to dig deeper.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea - I have now added this.

if not self._path_or_buf.read(2):
# end-of-file may have been reached, if so stop here
break

# otherwise back up and read again, taking byteorder into account
self._path_or_buf.seek(-2, os.SEEK_CUR)
n = self._read_uint16()
labname = self._decode(self._path_or_buf.read(9))
self._path_or_buf.read(1) # padding
codes = np.frombuffer(
self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n
)
self._value_label_dict[labname] = {}
for i in range(n):
self._value_label_dict[labname][codes[i]] = self._decode(
self._path_or_buf.read(8)
)

def _read_value_labels(self) -> None:
self._ensure_open()
if self._value_labels_read:
# Don't read twice
return

self._value_label_dict: dict[str, dict[int, str]] = {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps one last one. Should this be moved to the __init__? I prefer to declare all attributes there since it avoids late addition of attributes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have now moved this up as suggested. I put it in the `# State variables for the file` section as this seemed the closest fit, but can shift it around if you like.


if self._format_version >= 108:
self._read_new_value_labels()
else:
self._read_old_value_labels()
self._value_labels_read = True

def _read_strls(self) -> None:
Expand Down Expand Up @@ -1729,7 +1758,7 @@ def read(
i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
)

if convert_categoricals and self._format_version > 108:
if convert_categoricals:
data = self._do_convert_categoricals(
data, self._value_label_dict, self._lbllist, order_categoricals
)
Expand Down Expand Up @@ -1845,7 +1874,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
def _do_convert_categoricals(
self,
data: DataFrame,
value_label_dict: dict[str, dict[float, str]],
value_label_dict: dict[str, dict[int, str]],
lbllist: Sequence[str],
order_categoricals: bool,
) -> DataFrame:
Expand Down Expand Up @@ -1983,7 +2012,7 @@ def variable_labels(self) -> dict[str, str]:
self._ensure_open()
return dict(zip(self._varlist, self._variable_labels))

def value_labels(self) -> dict[str, dict[float, str]]:
def value_labels(self) -> dict[str, dict[int, str]]:
"""
Return a nested dict associating each variable name to its value and label.

Expand Down
Binary file added pandas/tests/io/data/stata/stata4_105.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_108.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_111.dta
Binary file not shown.
48 changes: 47 additions & 1 deletion pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath):
tm.assert_frame_equal(parsed, expected)

@pytest.mark.parametrize(
"file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"]
"file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
)
def test_read_dta4(self, file, datapath):
file = datapath("io", "data", "stata", f"{file}.dta")
Expand Down Expand Up @@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath):
# stata doesn't save .category metadata
tm.assert_frame_equal(parsed, expected)

@pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
def test_readold_dta4(self, file, datapath):
# Reads the stata4_105 / stata4_108 fixtures through self.read_dta and checks
# that the result equals a frame of five ordered-categorical columns.
#
# This test is the same as test_read_dta4 above except that the columns
# had to be renamed to match the restrictions in older file format
#
# NOTE(review): indentation is not preserved in this view; the statements
# from `orig = ...` down to `expected[col] = cat` are presumably the body of
# the `for col in expected` loop -- confirm against the actual file.

# Turn the parametrized stem into the path of the .dta fixture under
# io/data/stata before handing it to the reader.
file = datapath("io", "data", "stata", f"{file}.dta")
parsed = self.read_dta(file)

# Expected contents, one record per row in the column order listed below.
# "incmplab" holds raw integers 4-9 instead of label text in rows four to
# nine, and "misslab" is NaN from the fifth row onwards.
expected = DataFrame.from_records(
[
["one", "ten", "one", "one", "one"],
["two", "nine", "two", "two", "two"],
["three", "eight", "three", "three", "three"],
["four", "seven", 4, "four", "four"],
["five", "six", 5, np.nan, "five"],
["six", "five", 6, np.nan, "six"],
["seven", "four", 7, np.nan, "seven"],
["eight", "three", 8, np.nan, "eight"],
["nine", "two", 9, np.nan, "nine"],
["ten", "one", "ten", np.nan, "ten"],
],
columns=[
"fulllab",
"fulllab2",
"incmplab",
"misslab",
"floatlab",
],
)

# these are all categoricals
for col in expected:
# Work on a copy so the original object values stay available while the
# column is being replaced.
orig = expected[col].copy()

# Default category list: the "fulllab" labels at the positions where this
# column is not missing.
categories = np.asarray(expected["fulllab"][orig.notna()])
if col == "incmplab":
# This column mixes label text with raw integers, so its own values are
# used as the categories instead.
categories = orig

# Convert to category dtype, then impose the category list above as an
# ordered set on the underlying array.
cat = orig.astype("category")._values
cat = cat.set_categories(categories, ordered=True)
# Clear any name carried on the categories index before comparing.
cat.categories.rename(None, inplace=True)

expected[col] = cat

# stata doesn't save .category metadata
tm.assert_frame_equal(parsed, expected)

# File containing strls
def test_read_dta12(self, datapath):
parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))
Expand Down