From 41d1f2259e9e0c69d65217745658fe655ae66be1 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 14:00:16 +0100 Subject: [PATCH 01/11] ENH: Add support for reading value labels from 108-format and prior Stata dta files --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 83 +++++++++++++--------- pandas/tests/io/data/stata/stata4_105.dta | Bin 0 -> 816 bytes pandas/tests/io/data/stata/stata4_108.dta | Bin 0 -> 1214 bytes pandas/tests/io/data/stata/stata4_111.dta | Bin 0 -> 1528 bytes pandas/tests/io/test_stata.py | 48 ++++++++++++- 6 files changed, 98 insertions(+), 34 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata4_105.dta create mode 100644 pandas/tests/io/data/stata/stata4_108.dta create mode 100644 pandas/tests/io/data/stata/stata4_111.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d2d5707f32bf3..3ef0ff4074506 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) +- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3ec077806d6c4..310dad2e58859 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1507,11 +1507,6 @@ def _read_value_labels(self) -> None: if self._value_labels_read: # Don't read twice return - if self._format_version <= 108: - # Value labels are not supported in version 108 and earlier. - self._value_labels_read = True - self._value_label_dict: dict[str, dict[float, str]] = {} - return if self._format_version >= 117: self._path_or_buf.seek(self._seek_value_labels) @@ -1528,35 +1523,57 @@ def _read_value_labels(self) -> None: if self._path_or_buf.read(5) == b" break # end of value label table - slength = self._path_or_buf.read(4) - if not slength: - break # end of value label table (format < 117) - if self._format_version <= 117: - labname = self._decode(self._path_or_buf.read(33)) - else: - labname = self._decode(self._path_or_buf.read(129)) - self._path_or_buf.read(3) # padding + if self._format_version >= 108: + slength = self._path_or_buf.read(4) + if not slength: + break # end of value label table (format < 117) + if self._format_version <= 108: + labname = self._decode(self._path_or_buf.read(9)) + elif self._format_version <= 117: + labname = self._decode(self._path_or_buf.read(33)) + else: + labname = self._decode(self._path_or_buf.read(129)) + self._path_or_buf.read(3) # padding - n = self._read_uint32() - txtlen = self._read_uint32() - off = np.frombuffer( - self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n - ) - val = np.frombuffer( - self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n - ) - ii = np.argsort(off) - off = off[ii] - val = val[ii] - txt = self._path_or_buf.read(txtlen) - self._value_label_dict[labname] = {} - for i in range(n): - end = off[i + 1] if i < n - 1 else txtlen - self._value_label_dict[labname][val[i]] = self._decode( - txt[off[i] : end] + n = self._read_uint32() + txtlen = self._read_uint32() + off = np.frombuffer( + self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n ) - if self._format_version >= 117: - self._path_or_buf.read(6) # + val = np.frombuffer( + self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n + ) + ii = np.argsort(off) + off = off[ii] + val = val[ii] + txt = self._path_or_buf.read(txtlen) + self._value_label_dict[labname] = {} + for i in range(n): + end = off[i + 1] if i < n - 1 else txtlen + self._value_label_dict[labname][val[i]] = self._decode( + txt[off[i] : end] + ) + if self._format_version >= 117: + self._path_or_buf.read(6) # + else: + if not self._path_or_buf.read(2): + # end-of-file may have been reached, if so stop here + break + else: + # otherwise back up and read again, taking byteorder into account + self._path_or_buf.seek(-2, os.SEEK_CUR) + n = self._read_uint16() + labname = self._decode(self._path_or_buf.read(9)) + self._path_or_buf.read(1) # padding + codes = np.frombuffer( + self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n + ) + self._value_label_dict[labname] = {} + for i in range(n): + self._value_label_dict[labname][codes[i]] = self._decode( + self._path_or_buf.read(8) + ) + self._value_labels_read = True def _read_strls(self) -> None: @@ -1729,7 +1746,7 @@ def read( i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) ) - if convert_categoricals and self._format_version > 108: + if convert_categoricals: data = self._do_convert_categoricals( data, self._value_label_dict, self._lbllist, order_categoricals ) diff --git a/pandas/tests/io/data/stata/stata4_105.dta b/pandas/tests/io/data/stata/stata4_105.dta new file mode 100644 index 0000000000000000000000000000000000000000..f804c315b344bb10576889f5adbbddf445d28275 GIT binary patch literal 816 zcmcgqJ95G>5PcBLui^-5Xc=HALK>tdU8)=)V<$4!jIbD6hNKEd$x%28H%WG7hk=4g z7tiSF?e5b)$&zWe5MhYFKQR+eaUtf@*+NX04~qqo)S5J{thI3hG-M8wzizgi+L*Ge zOV-6cunU{<{bags;_>o=&*tagmWI7vY+{3HAY{4?giWtYO0fJx&<3>CZ(#oXprC!Li)#VU0)!%9 zg2So}j1&QQK;9831|Cv&fD;3F9ApF>8SwoHe93MDAw0?+O9(=El6`JK5JIQ{G0DJd zM~a@oVa0EOLqXyBKhwtAWZyxAE_&!=KySw7nzVr5yL!$F&6(6y1t(Io3xd+ThkB`Z jIwz*hI`cH=rf%Rvmd|s(#m4Wp$M2e9cxB}Mb;Z!MC9H z(wM%j}9bhNbdfM6G`>(VS@uIM|y$hk|3liQrUlCO8*d z2zVHtfU&urAtfFdhkMzWo@t^{Oj8tbrZu8;oFOt3jodsU*AtyUn?YP4F>ZkL?<3`eN9CFn(&_{$MeO-FOjeHk=k~v^%X9ye!^c7yd}=2UmMN zspZ|avTKCq7KRFM&0wfyEOUfXad)EPz#S-SRoJ&#AzvFWUV6wekzv=a>y3Vz*V>uA z)BH;EIMj=)dpaYxzAYrIn$6Kk`^))?)e1`pmW-UGV`F14QeIEX`nuKcXCgFlAfw-t z6&v2z1Pe%J%|I@{S>|D$4x`ZdHbJ~BpXYG|dL(V3%xWYn!Q^h?g=0gb!uV&yECDcq z0SMD>8K5dK00H<820?+q`7Xem0uZLW0x+Wh?~ee_d@l`2I^+99!H}eLen0{kk`%-t z?l=WHTNU^`OuO_(P|Cy;?g}pza_EGEphccY{}^$KI76HxE)bW9D+D!3ol!f)C-Z0< z$R?d7L!@a!3Gu{097IxrZ$=ULYAnI3d$4jWLy)Q;#1JYXV%ot&h5r$9sOSlTR@QLV S*roxhKqD#q5s~w6M)CoZS=6Wi literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 0cc8018ea6213..a58655d91a417 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) @pytest.mark.parametrize( - "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"] + "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] ) def test_read_dta4(self, file, datapath): file = datapath("io", "data", "stata", f"{file}.dta") @@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) + def test_readold_dta4(self, file, datapath): + # This test is the same as test_read_dta4 above except that the columns + # had to be renamed to match the restrictions in older file format + file = datapath("io", "data", "stata", f"{file}.dta") + parsed = self.read_dta(file) + + expected = DataFrame.from_records( + [ + ["one", "ten", "one", "one", "one"], + ["two", "nine", "two", "two", "two"], + ["three", "eight", "three", "three", "three"], + ["four", "seven", 4, "four", "four"], + ["five", "six", 5, np.nan, "five"], + ["six", "five", 6, np.nan, "six"], + ["seven", "four", 7, np.nan, "seven"], + ["eight", "three", 8, np.nan, "eight"], + ["nine", "two", 9, np.nan, "nine"], + ["ten", "one", "ten", np.nan, "ten"], + ], + columns=[ + "fulllab", + "fulllab2", + "incmplab", + "misslab", + "floatlab", + ], + ) + + # these are all categoricals + for col in expected: + orig = expected[col].copy() + + categories = np.asarray(expected["fulllab"][orig.notna()]) + if col == "incmplab": + categories = orig + + cat = orig.astype("category")._values + cat = cat.set_categories(categories, ordered=True) + cat.categories.rename(None, inplace=True) + + expected[col] = cat + + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed, expected) + # File containing strls def test_read_dta12(self, datapath): parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) From 3feed754024937a850194ced00034e45e571aa49 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 15:14:33 +0100 Subject: [PATCH 02/11] Add type hints for value label dictionary --- pandas/io/stata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 310dad2e58859..baf802bbb4e3b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1516,7 +1516,7 @@ def _read_value_labels(self) -> None: self._path_or_buf.seek(self._data_location + offset) self._value_labels_read = True - self._value_label_dict = {} + self._value_label_dict: dict[str, dict[int, str]] = {} while True: if self._format_version >= 117: @@ -1547,7 +1547,7 @@ def _read_value_labels(self) -> None: off = off[ii] val = val[ii] txt = self._path_or_buf.read(txtlen) - self._value_label_dict[labname] = {} + self._value_label_dict[labname]: dict[int, str] = {} for i in range(n): end = off[i + 1] if i < n - 1 else txtlen self._value_label_dict[labname][val[i]] = self._decode( @@ -1568,7 +1568,7 @@ def _read_value_labels(self) -> None: codes = np.frombuffer( self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n ) - self._value_label_dict[labname] = {} + self._value_label_dict[labname]: dict[int, str] = {} for i in range(n): self._value_label_dict[labname][codes[i]] = self._decode( self._path_or_buf.read(8) From be8aac51a33aa014c081eef381490982f357a477 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 16:30:55 +0100 Subject: [PATCH 03/11] Apply changes suggested by pylint --- pandas/io/stata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index baf802bbb4e3b..42fe5c09d1614 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1547,7 +1547,7 @@ def _read_value_labels(self) -> None: off = off[ii] val = val[ii] txt = self._path_or_buf.read(txtlen) - self._value_label_dict[labname]: dict[int, str] = {} + self._value_label_dict[labname] = {} for i in range(n): end = off[i + 1] if i < n - 1 else txtlen self._value_label_dict[labname][val[i]] = self._decode( @@ -1559,16 +1559,16 @@ def _read_value_labels(self) -> None: if not self._path_or_buf.read(2): # end-of-file may have been reached, if so stop here break - else: - # otherwise back up and read again, taking byteorder into account - self._path_or_buf.seek(-2, os.SEEK_CUR) - n = self._read_uint16() + + # otherwise back up and read again, taking byteorder into account + self._path_or_buf.seek(-2, os.SEEK_CUR) + n = self._read_uint16() labname = self._decode(self._path_or_buf.read(9)) self._path_or_buf.read(1) # padding codes = np.frombuffer( self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n ) - self._value_label_dict[labname]: dict[int, str] = {} + self._value_label_dict[labname] = {} for i in range(n): self._value_label_dict[labname][codes[i]] = self._decode( self._path_or_buf.read(8) From dd147368a7f314f9d928960dc8bf737be6e9172c Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 19:11:41 +0100 Subject: [PATCH 04/11] Clarify that only the 108 format has both 8 character (plus null terminator) label names and uses the newer value label layout --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 42fe5c09d1614..6bdfb66a1c51b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1527,7 +1527,7 @@ def _read_value_labels(self) -> None: slength = self._path_or_buf.read(4) if not slength: break # end of value label table (format < 117) - if self._format_version <= 108: + if self._format_version == 108: labname = self._decode(self._path_or_buf.read(9)) elif self._format_version <= 117: labname = self._decode(self._path_or_buf.read(33)) From c2836bf925c0aa9bea5079a2eae2fc6b6a2b58da Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 19:42:00 +0100 Subject: [PATCH 05/11] Split function for reading value labels into newer and older format versions --- pandas/io/stata.py | 118 ++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6bdfb66a1c51b..0aa0363d520c6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1502,6 +1502,65 @@ def _decode(self, s: bytes) -> str: ) return s.decode("latin-1") + def _read_new_value_labels(self) -> None: + while True: + if self._format_version >= 117: + if self._path_or_buf.read(5) == b" + break # end of value label table + + slength = self._path_or_buf.read(4) + if not slength: + break # end of value label table (format < 117), or end-of-file + if self._format_version == 108: + labname = self._decode(self._path_or_buf.read(9)) + elif self._format_version <= 117: + labname = self._decode(self._path_or_buf.read(33)) + else: + labname = self._decode(self._path_or_buf.read(129)) + self._path_or_buf.read(3) # padding + + n = self._read_uint32() + txtlen = self._read_uint32() + off = np.frombuffer( + self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n + ) + val = np.frombuffer( + self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n + ) + ii = np.argsort(off) + off = off[ii] + val = val[ii] + txt = self._path_or_buf.read(txtlen) + self._value_label_dict[labname] = {} + for i in range(n): + end = off[i + 1] if i < n - 1 else txtlen + self._value_label_dict[labname][val[i]] = self._decode( + txt[off[i] : end] + ) + + if self._format_version >= 117: + self._path_or_buf.read(6) # + + def _read_old_value_labels(self) -> None: + while True: + if not self._path_or_buf.read(2): + # end-of-file may have been reached, if so stop here + break + + # otherwise back up and read again, taking byteorder into account + self._path_or_buf.seek(-2, os.SEEK_CUR) + n = self._read_uint16() + labname = self._decode(self._path_or_buf.read(9)) + self._path_or_buf.read(1) # padding + codes = np.frombuffer( + self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n + ) + self._value_label_dict[labname] = {} + for i in range(n): + self._value_label_dict[labname][codes[i]] = self._decode( + self._path_or_buf.read(8) + ) + def _read_value_labels(self) -> None: self._ensure_open() if self._value_labels_read: @@ -1518,61 +1577,10 @@ def _read_value_labels(self) -> None: self._value_labels_read = True self._value_label_dict: dict[str, dict[int, str]] = {} - while True: - if self._format_version >= 117: - if self._path_or_buf.read(5) == b" - break # end of value label table - - if self._format_version >= 108: - slength = self._path_or_buf.read(4) - if not slength: - break # end of value label table (format < 117) - if self._format_version == 108: - labname = self._decode(self._path_or_buf.read(9)) - elif self._format_version <= 117: - labname = self._decode(self._path_or_buf.read(33)) - else: - labname = self._decode(self._path_or_buf.read(129)) - self._path_or_buf.read(3) # padding - - n = self._read_uint32() - txtlen = self._read_uint32() - off = np.frombuffer( - self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n - ) - val = np.frombuffer( - self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n - ) - ii = np.argsort(off) - off = off[ii] - val = val[ii] - txt = self._path_or_buf.read(txtlen) - self._value_label_dict[labname] = {} - for i in range(n): - end = off[i + 1] if i < n - 1 else txtlen - self._value_label_dict[labname][val[i]] = self._decode( - txt[off[i] : end] - ) - if self._format_version >= 117: - self._path_or_buf.read(6) # - else: - if not self._path_or_buf.read(2): - # end-of-file may have been reached, if so stop here - break - - # otherwise back up and read again, taking byteorder into account - self._path_or_buf.seek(-2, os.SEEK_CUR) - n = self._read_uint16() - labname = self._decode(self._path_or_buf.read(9)) - self._path_or_buf.read(1) # padding - codes = np.frombuffer( - self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n - ) - self._value_label_dict[labname] = {} - for i in range(n): - self._value_label_dict[labname][codes[i]] = self._decode( - self._path_or_buf.read(8) - ) + if self._format_version >= 108: + self._read_new_value_labels() + else: + self._read_old_value_labels() self._value_labels_read = True From 23100224c4a568642c8391525a694dbf897d2893 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 20:43:04 +0100 Subject: [PATCH 06/11] Remove duplicate line --- pandas/io/stata.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0aa0363d520c6..c7681fca5db86 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1582,8 +1582,6 @@ def _read_value_labels(self) -> None: else: self._read_old_value_labels() - self._value_labels_read = True - def _read_strls(self) -> None: self._path_or_buf.seek(self._seek_strls) # Wrap v_o in a string to allow uint64 values as keys on 32bit OS From bf8620c833507aa5abd81202ac770fa4d53101a7 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 5 Apr 2024 21:21:46 +0100 Subject: [PATCH 07/11] Update type hints for value label dictionary keys to match read content --- pandas/io/stata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c7681fca5db86..81f09a3726a37 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1868,7 +1868,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra def _do_convert_categoricals( self, data: DataFrame, - value_label_dict: dict[str, dict[float, str]], + value_label_dict: dict[str, dict[int, str]], lbllist: Sequence[str], order_categoricals: bool, ) -> DataFrame: @@ -2006,7 +2006,7 @@ def variable_labels(self) -> dict[str, str]: self._ensure_open() return dict(zip(self._varlist, self._variable_labels)) - def value_labels(self) -> dict[str, dict[float, str]]: + def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. From af7d5e48059695bd4144ccf8ba1ccd54090bd42d Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 11:31:26 +0100 Subject: [PATCH 08/11] Indicate versions each value label helper function applies to via docstrings --- pandas/io/stata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c1c19a75500f9..39e26ba3faa53 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1503,6 +1503,7 @@ def _decode(self, s: bytes) -> str: return s.decode("latin-1") def _read_new_value_labels(self) -> None: + """Reads value labels with variable length strings (108 and later format)""" while True: if self._format_version >= 117: if self._path_or_buf.read(5) == b" @@ -1542,6 +1543,7 @@ def _read_new_value_labels(self) -> None: self._path_or_buf.read(6) # def _read_old_value_labels(self) -> None: + """Reads value labels with fixed-length strings (105 and earlier format)""" while True: if not self._path_or_buf.read(2): # end-of-file may have been reached, if so stop here From b0dc32063042f6a4be3a769f6c5578f681cde4c5 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 11:37:39 +0100 Subject: [PATCH 09/11] Seek to value table location within version specific helper functions --- pandas/io/stata.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 39e26ba3faa53..cabe8b018bcb8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1504,6 +1504,13 @@ def _decode(self, s: bytes) -> str: def _read_new_value_labels(self) -> None: """Reads value labels with variable length strings (108 and later format)""" + if self._format_version >= 117: + self._path_or_buf.seek(self._seek_value_labels) + else: + assert self._dtype is not None + offset = self._nobs * self._dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + while True: if self._format_version >= 117: if self._path_or_buf.read(5) == b" @@ -1544,6 +1551,10 @@ def _read_new_value_labels(self) -> None: def _read_old_value_labels(self) -> None: """Reads value labels with fixed-length strings (105 and earlier format)""" + assert self._dtype is not None + offset = self._nobs * self._dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + while True: if not self._path_or_buf.read(2): # end-of-file may have been reached, if so stop here @@ -1569,13 +1580,6 @@ def _read_value_labels(self) -> None: # Don't read twice return - if self._format_version >= 117: - self._path_or_buf.seek(self._seek_value_labels) - else: - assert self._dtype is not None - offset = self._nobs * self._dtype.itemsize - self._path_or_buf.seek(self._data_location + offset) - self._value_labels_read = True self._value_label_dict: dict[str, dict[int, str]] = {} From 792d10cd78a371c833fce7f73aa1547d1b3d85e5 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 11:41:28 +0100 Subject: [PATCH 10/11] Wait until value labels are read before setting flag --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cabe8b018bcb8..3c8669c70c9d6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1580,13 +1580,13 @@ def _read_value_labels(self) -> None: # Don't read twice return - self._value_labels_read = True self._value_label_dict: dict[str, dict[int, str]] = {} if self._format_version >= 108: self._read_new_value_labels() else: self._read_old_value_labels() + self._value_labels_read = True def _read_strls(self) -> None: self._path_or_buf.seek(self._seek_strls) From 445fbaf0969a763fe342e855d5aa47ec939cefe5 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 12:33:46 +0100 Subject: [PATCH 11/11] Move value label dictionary initialisation to class __init__ --- pandas/io/stata.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3c8669c70c9d6..47d879c022ee6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1122,6 +1122,7 @@ def __init__( # State variables for the file self._close_file: Callable[[], None] | None = None self._column_selector_set = False + self._value_label_dict: dict[str, dict[int, str]] = {} self._value_labels_read = False self._dtype: np.dtype | None = None self._lines_read = 0 @@ -1580,8 +1581,6 @@ def _read_value_labels(self) -> None: # Don't read twice return - self._value_label_dict: dict[str, dict[int, str]] = {} - if self._format_version >= 108: self._read_new_value_labels() else: