Skip to content

Commit f12034a

Browse files
committed
ENH: Add dta 119 reading to StataReader
Add requirements for reading 119 format files
1 parent b95a7eb commit f12034a

File tree

4 files changed

+32
-13
lines changed

4 files changed

+32
-13
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Other enhancements
3737
pandas (so it will become an integer or float dtype depending on the presence of missing data).
3838
(:issue:`28368`)
3939
- :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`)
40-
40+
- :func:`read_stata` can read Stata 119 dta files (:issue:`28250`)
4141

4242
Build Changes
4343
^^^^^^^^^^^^^

pandas/io/stata.py

+18-12
Original file line numberDiff line numberDiff line change
@@ -1139,13 +1139,17 @@ def _read_new_header(self, first_char):
11391139
# The first part of the header is common to 117, 118 and 119.
11401140
self.path_or_buf.read(27) # stata_dta><header><release>
11411141
self.format_version = int(self.path_or_buf.read(3))
1142-
if self.format_version not in [117, 118]:
1142+
if self.format_version not in [117, 118, 119]:
11431143
raise ValueError(_version_error)
11441144
self._set_encoding()
11451145
self.path_or_buf.read(21) # </release><byteorder>
11461146
self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
11471147
self.path_or_buf.read(15) # </byteorder><K>
1148-
self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
1148+
nvar_type = "H" if self.format_version <= 118 else "I"
1149+
nvar_size = 2 if self.format_version <= 118 else 4
1150+
self.nvar = struct.unpack(
1151+
self.byteorder + nvar_type, self.path_or_buf.read(nvar_size)
1152+
)[0]
11491153
self.path_or_buf.read(7) # </K><N>
11501154

11511155
self.nobs = self._get_nobs()
@@ -1207,7 +1211,7 @@ def _read_new_header(self, first_char):
12071211
self.path_or_buf.seek(self._seek_variable_labels)
12081212
self._variable_labels = self._get_variable_labels()
12091213

1210-
# Get data type information, works for versions 117-118.
1214+
# Get data type information, works for versions 117-119.
12111215
def _get_dtypes(self, seek_vartypes):
12121216

12131217
self.path_or_buf.seek(seek_vartypes)
@@ -1241,14 +1245,14 @@ def f(typ):
12411245
def _get_varlist(self):
12421246
if self.format_version == 117:
12431247
b = 33
1244-
elif self.format_version == 118:
1248+
elif self.format_version >= 118:
12451249
b = 129
12461250

12471251
return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]
12481252

12491253
# Returns the format list
12501254
def _get_fmtlist(self):
1251-
if self.format_version == 118:
1255+
if self.format_version >= 118:
12521256
b = 57
12531257
elif self.format_version > 113:
12541258
b = 49
@@ -1270,7 +1274,7 @@ def _get_lbllist(self):
12701274
return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)]
12711275

12721276
def _get_variable_labels(self):
1273-
if self.format_version == 118:
1277+
if self.format_version >= 118:
12741278
vlblist = [
12751279
self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)
12761280
]
@@ -1285,13 +1289,13 @@ def _get_variable_labels(self):
12851289
return vlblist
12861290

12871291
def _get_nobs(self):
1288-
if self.format_version == 118:
1292+
if self.format_version >= 118:
12891293
return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]
12901294
else:
12911295
return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
12921296

12931297
def _get_data_label(self):
1294-
if self.format_version == 118:
1298+
if self.format_version >= 118:
12951299
strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
12961300
return self._decode(self.path_or_buf.read(strlen))
12971301
elif self.format_version == 117:
@@ -1303,7 +1307,7 @@ def _get_data_label(self):
13031307
return self._decode(self.path_or_buf.read(32))
13041308

13051309
def _get_time_stamp(self):
1306-
if self.format_version == 118:
1310+
if self.format_version >= 118:
13071311
strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
13081312
return self.path_or_buf.read(strlen).decode("utf-8")
13091313
elif self.format_version == 117:
@@ -1321,7 +1325,7 @@ def _get_seek_variable_labels(self):
13211325
# a work around that uses the previous label, 33 bytes for each
13221326
# variable, 20 for the closing tag and 17 for the opening tag
13231327
return self._seek_value_label_names + (33 * self.nvar) + 20 + 17
1324-
elif self.format_version == 118:
1328+
elif self.format_version >= 118:
13251329
return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17
13261330
else:
13271331
raise ValueError()
@@ -1519,10 +1523,12 @@ def _read_strls(self):
15191523
else:
15201524
buf = self.path_or_buf.read(12)
15211525
# Only tested on a little-endian file on a little-endian machine.
1526+
v_size = 2 if self.format_version == 118 else 3
15221527
if self.byteorder == "<":
1523-
buf = buf[0:2] + buf[4:10]
1528+
buf = buf[0:v_size] + buf[4 : 12 - v_size]
15241529
else:
1525-
buf = buf[0:2] + buf[6:]
1530+
# This big-endian path may not be correct; it is impossible to test.
1531+
buf = buf[0:v_size] + buf[4 + v_size :]
15261532
v_o = struct.unpack("Q", buf)[0]
15271533
typ = struct.unpack("B", self.path_or_buf.read(1))[0]
15281534
length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]
263 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+13
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ def setup_method(self, datapath):
101101
self.dta24_111 = os.path.join(self.dirpath, "stata7_111.dta")
102102
self.dta25_118 = os.path.join(self.dirpath, "stata16_118.dta")
103103

104+
self.dta26_119 = os.path.join(self.dirpath, "stata1_119.dta.gz")
105+
104106
self.stata_dates = os.path.join(self.dirpath, "stata13_dates.dta")
105107

106108
def read_dta(self, file):
@@ -1780,3 +1782,14 @@ def test_encoding_latin1_118(self):
17801782

17811783
expected = pd.DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
17821784
tm.assert_frame_equal(encoded, expected)
1785+
1786+
@pytest.mark.slow
1787+
def test_stata_119(self):
1788+
# Gzipped because the file contains 32,999 variables and is 20 MiB uncompressed
1789+
with gzip.open(self.dta26_119, "rb") as gz:
1790+
df = read_stata(gz)
1791+
assert df.shape == (1, 32999)
1792+
assert df.iloc[0, 6] == "A" * 3000
1793+
assert df.iloc[0, 7] == 3.14
1794+
assert df.iloc[0, -1] == 1
1795+
assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))

0 commit comments

Comments
 (0)