-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add support for reading value labels from 108-format and prior Stata dta files #58155
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
41d1f22
3feed75
be8aac5
dd14736
c2836bf
3f2acb3
2310022
bf8620c
36acb33
af7d5e4
b0dc320
792d10c
445fbaf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1507,11 +1507,6 @@ def _read_value_labels(self) -> None: | |
if self._value_labels_read: | ||
# Don't read twice | ||
return | ||
if self._format_version <= 108: | ||
# Value labels are not supported in version 108 and earlier. | ||
self._value_labels_read = True | ||
self._value_label_dict: dict[str, dict[float, str]] = {} | ||
return | ||
|
||
if self._format_version >= 117: | ||
self._path_or_buf.seek(self._seek_value_labels) | ||
|
@@ -1521,42 +1516,64 @@ def _read_value_labels(self) -> None: | |
self._path_or_buf.seek(self._data_location + offset) | ||
|
||
self._value_labels_read = True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps move these to after the block that does the read. Make a bit more sense there now that this is short. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree, I have now moved this down. |
||
self._value_label_dict = {} | ||
self._value_label_dict: dict[str, dict[int, str]] = {} | ||
|
||
while True: | ||
if self._format_version >= 117: | ||
if self._path_or_buf.read(5) == b"</val": # <lbl> | ||
break # end of value label table | ||
|
||
slength = self._path_or_buf.read(4) | ||
if not slength: | ||
break # end of value label table (format < 117) | ||
if self._format_version <= 117: | ||
labname = self._decode(self._path_or_buf.read(33)) | ||
if self._format_version >= 108: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something is off with the logic here and a few lines above with the other check. This should only be read if < 117 it looks like. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The content of |
||
slength = self._path_or_buf.read(4) | ||
if not slength: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps this should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I hadn't looked at the slength bit previously, but it could probably be a bit clearer. The |
||
break # end of value label table (format < 117) | ||
if self._format_version <= 108: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be == 108 since you ahve >= 108 above and <= 108 means ==108. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this should be equivalent (and clearer) - I was using the cut-off versions for label name lengths for reference. I'll make the suggested change. |
||
labname = self._decode(self._path_or_buf.read(9)) | ||
elif self._format_version <= 117: | ||
labname = self._decode(self._path_or_buf.read(33)) | ||
else: | ||
labname = self._decode(self._path_or_buf.read(129)) | ||
self._path_or_buf.read(3) # padding | ||
|
||
n = self._read_uint32() | ||
txtlen = self._read_uint32() | ||
off = np.frombuffer( | ||
self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n | ||
) | ||
val = np.frombuffer( | ||
self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n | ||
) | ||
ii = np.argsort(off) | ||
off = off[ii] | ||
val = val[ii] | ||
txt = self._path_or_buf.read(txtlen) | ||
self._value_label_dict[labname] = {} | ||
for i in range(n): | ||
end = off[i + 1] if i < n - 1 else txtlen | ||
self._value_label_dict[labname][val[i]] = self._decode( | ||
txt[off[i] : end] | ||
) | ||
if self._format_version >= 117: | ||
self._path_or_buf.read(6) # </lbl> | ||
else: | ||
labname = self._decode(self._path_or_buf.read(129)) | ||
self._path_or_buf.read(3) # padding | ||
if not self._path_or_buf.read(2): | ||
# end-of-file may have been reached, if so stop here | ||
break | ||
|
||
n = self._read_uint32() | ||
txtlen = self._read_uint32() | ||
off = np.frombuffer( | ||
self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n | ||
) | ||
val = np.frombuffer( | ||
self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n | ||
) | ||
ii = np.argsort(off) | ||
off = off[ii] | ||
val = val[ii] | ||
txt = self._path_or_buf.read(txtlen) | ||
self._value_label_dict[labname] = {} | ||
for i in range(n): | ||
end = off[i + 1] if i < n - 1 else txtlen | ||
self._value_label_dict[labname][val[i]] = self._decode( | ||
txt[off[i] : end] | ||
# otherwise back up and read again, taking byteorder into account | ||
self._path_or_buf.seek(-2, os.SEEK_CUR) | ||
n = self._read_uint16() | ||
labname = self._decode(self._path_or_buf.read(9)) | ||
self._path_or_buf.read(1) # padding | ||
codes = np.frombuffer( | ||
self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n | ||
) | ||
if self._format_version >= 117: | ||
self._path_or_buf.read(6) # </lbl> | ||
self._value_label_dict[labname] = {} | ||
for i in range(n): | ||
self._value_label_dict[labname][codes[i]] = self._decode( | ||
self._path_or_buf.read(8) | ||
) | ||
|
||
self._value_labels_read = True | ||
|
||
def _read_strls(self) -> None: | ||
|
@@ -1729,7 +1746,7 @@ def read( | |
i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) | ||
) | ||
|
||
if convert_categoricals and self._format_version > 108: | ||
if convert_categoricals: | ||
data = self._do_convert_categoricals( | ||
data, self._value_label_dict, self._lbllist, order_categoricals | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this logic move to the helper functions? Seems cleaner to me to move the cursor in the code that does the actual reading. I think the other lines about checking if already can stay here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I was trying to avoid duplicating calculations for the seek location between the two function, but that does make sense (maybe long term the calculations could be moved to
_read_old_header
and stored inself._seek_value_labels
which would then match_read_new_header
?). I have now made the suggested change here too.