Skip to content

Commit 2d1e59d

Browse files
authored
BUG: Unable to open Stata 118 or 119 format files saved in big-endian format that contain strL data (pandas-dev#58640)
* BUG: Unable to open Stata 118 or 119 format files saved in big-endian format that contain strL data
* Rename test functions to make their purpose clearer
1 parent a787f45 commit 2d1e59d

14 files changed

+43 -12 lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ I/O
546546
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
547547
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
548548
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
549+
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
549550

550551
Period
551552
^^^^^^

pandas/io/stata.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -1600,14 +1600,13 @@ def _read_strls(self) -> None:
16001600
v_o = self._read_uint64()
16011601
else:
16021602
buf = self._path_or_buf.read(12)
1603-
# Only tested on little endian file on little endian machine.
1603+
# Only tested on little endian machine.
16041604
v_size = 2 if self._format_version == 118 else 3
16051605
if self._byteorder == "<":
16061606
buf = buf[0:v_size] + buf[4 : (12 - v_size)]
16071607
else:
1608-
# This path may not be correct, impossible to test
1609-
buf = buf[0:v_size] + buf[(4 + v_size) :]
1610-
v_o = struct.unpack("Q", buf)[0]
1608+
buf = buf[4 - v_size : 4] + buf[(4 + v_size) :]
1609+
v_o = struct.unpack(f"{self._byteorder}Q", buf)[0]
16111610
typ = self._read_uint8()
16121611
length = self._read_uint32()
16131612
va = self._path_or_buf.read(length)
2.56 KB
Binary file not shown.
2.57 KB
Binary file not shown.
1.25 KB
Binary file not shown.
2.56 KB
Binary file not shown.
2.57 KB
Binary file not shown.
5.44 KB
Binary file not shown.
5.43 KB
Binary file not shown.
5.44 KB
Binary file not shown.
4.52 KB
Binary file not shown.
4.51 KB
Binary file not shown.
4.52 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+39-8
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,19 @@ def test_readold_dta4(self, version, datapath):
314314
tm.assert_frame_equal(parsed, expected)
315315

316316
# File containing strls
317-
def test_read_dta12(self, datapath):
318-
parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))
317+
@pytest.mark.parametrize(
318+
"file",
319+
[
320+
"stata12_117",
321+
"stata12_be_117",
322+
"stata12_118",
323+
"stata12_be_118",
324+
"stata12_119",
325+
"stata12_be_119",
326+
],
327+
)
328+
def test_read_dta_strl(self, file, datapath):
329+
parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
319330
expected = DataFrame.from_records(
320331
[
321332
[1, "abc", "abcdefghi"],
@@ -325,10 +336,20 @@ def test_read_dta12(self, datapath):
325336
columns=["x", "y", "z"],
326337
)
327338

328-
tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
339+
tm.assert_frame_equal(parsed, expected, check_dtype=False)
329340

330-
def test_read_dta18(self, datapath):
331-
parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta"))
341+
# 117 is not included in this list as it uses ASCII strings
342+
@pytest.mark.parametrize(
343+
"file",
344+
[
345+
"stata14_118",
346+
"stata14_be_118",
347+
"stata14_119",
348+
"stata14_be_119",
349+
],
350+
)
351+
def test_read_dta118_119(self, file, datapath):
352+
parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
332353
parsed_118["Bytes"] = parsed_118["Bytes"].astype("O")
333354
expected = DataFrame.from_records(
334355
[
@@ -352,7 +373,7 @@ def test_read_dta18(self, datapath):
352373
for col in parsed_118.columns:
353374
tm.assert_almost_equal(parsed_118[col], expected[col])
354375

355-
with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr:
376+
with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr:
356377
vl = rdr.variable_labels()
357378
vl_expected = {
358379
"Unicode_Cities_Strl": "Here are some strls with Ünicode chars",
@@ -1799,8 +1820,18 @@ def test_gzip_writing(self, temp_file):
17991820
reread = read_stata(gz, index_col="index")
18001821
tm.assert_frame_equal(df, reread)
18011822

1802-
def test_unicode_dta_118(self, datapath):
1803-
unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta"))
1823+
# 117 is not included in this list as it uses ASCII strings
1824+
@pytest.mark.parametrize(
1825+
"file",
1826+
[
1827+
"stata16_118",
1828+
"stata16_be_118",
1829+
"stata16_119",
1830+
"stata16_be_119",
1831+
],
1832+
)
1833+
def test_unicode_dta_118_119(self, file, datapath):
1834+
unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
18041835

18051836
columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"]
18061837
values = [

0 commit comments

Comments
 (0)