diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 54a6171f623f6..2b6fc46311ea7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -37,7 +37,7 @@ Other enhancements pandas (so it will become an integer or float dtype depending on the presence of missing data). (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - +- :func:`read_stata` can read Stata 119 dta files. (:issue:`28250`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 31fdaa5cc6735..c67106e897727 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1139,13 +1139,17 @@ def _read_new_header(self, first_char): # The first part of the header is common to 117 and 118. self.path_or_buf.read(27) # stata_dta>
self.format_version = int(self.path_or_buf.read(3)) - if self.format_version not in [117, 118]: + if self.format_version not in [117, 118, 119]: raise ValueError(_version_error) self._set_encoding() self.path_or_buf.read(21) # self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" self.path_or_buf.read(15) # - self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] + nvar_type = "H" if self.format_version <= 118 else "I" + nvar_size = 2 if self.format_version <= 118 else 4 + self.nvar = struct.unpack( + self.byteorder + nvar_type, self.path_or_buf.read(nvar_size) + )[0] self.path_or_buf.read(7) # self.nobs = self._get_nobs() @@ -1207,7 +1211,7 @@ def _read_new_header(self, first_char): self.path_or_buf.seek(self._seek_variable_labels) self._variable_labels = self._get_variable_labels() - # Get data type information, works for versions 117-118. + # Get data type information, works for versions 117-119. def _get_dtypes(self, seek_vartypes): self.path_or_buf.seek(seek_vartypes) @@ -1241,14 +1245,14 @@ def f(typ): def _get_varlist(self): if self.format_version == 117: b = 33 - elif self.format_version == 118: + elif self.format_version >= 118: b = 129 return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the format list def _get_fmtlist(self): - if self.format_version == 118: + if self.format_version >= 118: b = 57 elif self.format_version > 113: b = 49 @@ -1270,7 +1274,7 @@ def _get_lbllist(self): return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] def _get_variable_labels(self): - if self.format_version == 118: + if self.format_version >= 118: vlblist = [ self._decode(self.path_or_buf.read(321)) for i in range(self.nvar) ] @@ -1285,13 +1289,13 @@ def _get_variable_labels(self): return vlblist def _get_nobs(self): - if self.format_version == 118: + if self.format_version >= 118: return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] else: return 
struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] def _get_data_label(self): - if self.format_version == 118: + if self.format_version >= 118: strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version == 117: @@ -1303,7 +1307,7 @@ def _get_data_label(self): return self._decode(self.path_or_buf.read(32)) def _get_time_stamp(self): - if self.format_version == 118: + if self.format_version >= 118: strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self.path_or_buf.read(strlen).decode("utf-8") elif self.format_version == 117: @@ -1321,7 +1325,7 @@ def _get_seek_variable_labels(self): # a work around that uses the previous label, 33 bytes for each # variable, 20 for the closing tag and 17 for the opening tag return self._seek_value_label_names + (33 * self.nvar) + 20 + 17 - elif self.format_version == 118: + elif self.format_version >= 118: return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17 else: raise ValueError() @@ -1519,10 +1523,12 @@ def _read_strls(self): else: buf = self.path_or_buf.read(12) # Only tested on little endian file on little endian machine. 
+ v_size = 2 if self.format_version == 118 else 3 if self.byteorder == "<": - buf = buf[0:2] + buf[4:10] + buf = buf[0:v_size] + buf[4 : 12 - v_size] else: - buf = buf[0:2] + buf[6:] + # This path may not be correct, impossible to test + buf = buf[0:v_size] + buf[4 + v_size :] v_o = struct.unpack("Q", buf)[0] typ = struct.unpack("B", self.path_or_buf.read(1))[0] length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] diff --git a/pandas/tests/io/data/stata1_119.dta.gz b/pandas/tests/io/data/stata1_119.dta.gz new file mode 100644 index 0000000000000..0f75d8b92db14 Binary files /dev/null and b/pandas/tests/io/data/stata1_119.dta.gz differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 1e7d568602656..a0ec06a2197ae 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -101,6 +101,8 @@ def setup_method(self, datapath): self.dta24_111 = os.path.join(self.dirpath, "stata7_111.dta") self.dta25_118 = os.path.join(self.dirpath, "stata16_118.dta") + self.dta26_119 = os.path.join(self.dirpath, "stata1_119.dta.gz") + self.stata_dates = os.path.join(self.dirpath, "stata13_dates.dta") def read_dta(self, file): @@ -1780,3 +1782,14 @@ def test_encoding_latin1_118(self): expected = pd.DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"]) tm.assert_frame_equal(encoded, expected) + + @pytest.mark.slow + def test_stata_119(self): + # Gzipped since contains 32,999 variables and uncompressed is 20MiB + with gzip.open(self.dta26_119, "rb") as gz: + df = read_stata(gz) + assert df.shape == (1, 32999) + assert df.iloc[0, 6] == "A" * 3000 + assert df.iloc[0, 7] == 3.14 + assert df.iloc[0, -1] == 1 + assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))