-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add support for reading value labels from 108-format and prior Stata dta files #58155
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
41d1f22
3feed75
be8aac5
dd14736
c2836bf
3f2acb3
2310022
bf8620c
36acb33
af7d5e4
b0dc320
792d10c
445fbaf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1502,36 +1502,26 @@ def _decode(self, s: bytes) -> str: | |
) | ||
return s.decode("latin-1") | ||
|
||
def _read_value_labels(self) -> None: | ||
self._ensure_open() | ||
if self._value_labels_read: | ||
# Don't read twice | ||
return | ||
if self._format_version <= 108: | ||
# Value labels are not supported in version 108 and earlier. | ||
self._value_labels_read = True | ||
self._value_label_dict: dict[str, dict[float, str]] = {} | ||
return | ||
|
||
def _read_new_value_labels(self) -> None: | ||
"""Reads value labels with variable length strings (108 and later format)""" | ||
if self._format_version >= 117: | ||
self._path_or_buf.seek(self._seek_value_labels) | ||
else: | ||
assert self._dtype is not None | ||
offset = self._nobs * self._dtype.itemsize | ||
self._path_or_buf.seek(self._data_location + offset) | ||
|
||
self._value_labels_read = True | ||
self._value_label_dict = {} | ||
|
||
while True: | ||
if self._format_version >= 117: | ||
if self._path_or_buf.read(5) == b"</val": # <lbl> | ||
break # end of value label table | ||
|
||
slength = self._path_or_buf.read(4) | ||
if not slength: | ||
break # end of value label table (format < 117) | ||
if self._format_version <= 117: | ||
break # end of value label table (format < 117), or end-of-file | ||
if self._format_version == 108: | ||
labname = self._decode(self._path_or_buf.read(9)) | ||
elif self._format_version <= 117: | ||
labname = self._decode(self._path_or_buf.read(33)) | ||
else: | ||
labname = self._decode(self._path_or_buf.read(129)) | ||
|
@@ -1555,8 +1545,47 @@ def _read_value_labels(self) -> None: | |
self._value_label_dict[labname][val[i]] = self._decode( | ||
txt[off[i] : end] | ||
) | ||
|
||
if self._format_version >= 117: | ||
self._path_or_buf.read(6) # </lbl> | ||
|
||
def _read_old_value_labels(self) -> None: | ||
"""Reads value labels with fixed-length strings (105 and earlier format)""" | ||
assert self._dtype is not None | ||
offset = self._nobs * self._dtype.itemsize | ||
self._path_or_buf.seek(self._data_location + offset) | ||
|
||
while True: | ||
if not self._path_or_buf.read(2): | ||
# end-of-file may have been reached, if so stop here | ||
break | ||
|
||
# otherwise back up and read again, taking byteorder into account | ||
self._path_or_buf.seek(-2, os.SEEK_CUR) | ||
n = self._read_uint16() | ||
labname = self._decode(self._path_or_buf.read(9)) | ||
self._path_or_buf.read(1) # padding | ||
codes = np.frombuffer( | ||
self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n | ||
) | ||
self._value_label_dict[labname] = {} | ||
for i in range(n): | ||
self._value_label_dict[labname][codes[i]] = self._decode( | ||
self._path_or_buf.read(8) | ||
) | ||
|
||
def _read_value_labels(self) -> None: | ||
self._ensure_open() | ||
if self._value_labels_read: | ||
# Don't read twice | ||
return | ||
|
||
self._value_label_dict: dict[str, dict[int, str]] = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Perhaps one last one. Should this be moved to the […]? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have now moved this up as suggested. I put it in the […] |
||
|
||
if self._format_version >= 108: | ||
self._read_new_value_labels() | ||
else: | ||
self._read_old_value_labels() | ||
self._value_labels_read = True | ||
|
||
def _read_strls(self) -> None: | ||
|
@@ -1729,7 +1758,7 @@ def read( | |
i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) | ||
) | ||
|
||
if convert_categoricals and self._format_version > 108: | ||
if convert_categoricals: | ||
data = self._do_convert_categoricals( | ||
data, self._value_label_dict, self._lbllist, order_categoricals | ||
) | ||
|
@@ -1845,7 +1874,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra | |
def _do_convert_categoricals( | ||
self, | ||
data: DataFrame, | ||
value_label_dict: dict[str, dict[float, str]], | ||
value_label_dict: dict[str, dict[int, str]], | ||
lbllist: Sequence[str], | ||
order_categoricals: bool, | ||
) -> DataFrame: | ||
|
@@ -1983,7 +2012,7 @@ def variable_labels(self) -> dict[str, str]: | |
self._ensure_open() | ||
return dict(zip(self._varlist, self._variable_labels)) | ||
|
||
def value_labels(self) -> dict[str, dict[float, str]]: | ||
def value_labels(self) -> dict[str, dict[int, str]]: | ||
""" | ||
Return a nested dict associating each variable name to its value and label. | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A small one-line docstring here and on the method above, indicating which format versions each one targets, would help readability without the reader having to dig deeper.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea - I have now added this.