Skip to content

Commit 3572f16

Browse files
Debian Science Team, rebecca-palmer
Debian Science Team
authored and committed
HDF5 and Stata I/O are broken on some architectures
Fix some issues, warn on use, and xfail tests for the remainder.

armhf TestHDF5Store::test*encoding only sometimes crashes (1.1.3+dfsg-1 passed on build but failed autopkgtest).
HDF5 and Stata are known to fail on big-endian architectures.
Stata also fails on qemu-ppc64el, but not on real ppc64el.

Author: Andreas Tille <[email protected]>, Graham Inggs <[email protected]>, Yaroslav Halchenko <[email protected]>, Rebecca N. Palmer <[email protected]>
Bug-Debian: https://bugs.debian.org/877419
Forwarded: no
Gbp-Pq: Name xfail_tests_nonintel_io.patch
1 parent ddddadb commit 3572f16

File tree

7 files changed

+43
-8
lines changed

7 files changed

+43
-8
lines changed

pandas/_testing.py

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import gzip
77
import operator
88
import os
9+
import platform
10+
import re
911
from shutil import rmtree
1012
import string
1113
import tempfile
@@ -2612,6 +2614,8 @@ class for all warnings. To check that no warning is returned,
26122614
)
26132615
assert actual_warning.filename == caller.filename, msg
26142616
else:
2617+
if actual_warning.category==UserWarning and "Non-x86 system detected" in str(actual_warning.message) and not bool(re.match('i.?86|x86',platform.uname()[4])):
2618+
continue
26152619
extra_warnings.append(
26162620
(
26172621
actual_warning.category.__name__,

pandas/io/pytables.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
import re
1010
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
1111
import warnings
12+
import platform
13+
import re
14+
from pandas.compat import is_platform_little_endian
15+
warn_hdf_platform = "Non-x86 system detected, HDF(5) format I/O may give wrong results (particularly on files created with older versions) or crash - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
1216

1317
import numpy as np
1418

@@ -527,6 +531,8 @@ def __init__(
527531
fletcher32: bool = False,
528532
**kwargs,
529533
):
534+
if warn_hdf_platform:
535+
warnings.warn(warn_hdf_platform)
530536

531537
if "format" in kwargs:
532538
raise ValueError("format is not a defined argument for HDFStore")
@@ -766,7 +772,10 @@ def flush(self, fsync: bool = False):
766772
self._handle.flush()
767773
if fsync:
768774
try:
769-
os.fsync(self._handle.fileno())
775+
if is_platform_little_endian():
776+
os.fsync(self._handle.fileno())
777+
else:
778+
os.sync() # due to a pytables bad-cast bug, fileno is invalid on 64-bit big-endian
770779
except OSError:
771780
pass
772781

pandas/io/stata.py

+5
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
Union,
3030
)
3131
import warnings
32+
import platform
33+
import re
34+
warn_stata_platform = "Non-x86 system detected, Stata format I/O may give wrong results (particularly on strings) - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
3235

3336
from dateutil.relativedelta import relativedelta
3437
import numpy as np
@@ -875,6 +878,8 @@ def __init__(self):
875878
# NOTE: the byte type seems to be reserved for categorical variables
876879
# with a label, but the underlying variable is -127 to 100
877880
# we're going to drop the label and cast to int
881+
if warn_stata_platform:
882+
warnings.warn(warn_stata_platform)
878883
self.DTYPE_MAP = dict(
879884
list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
880885
+ [

pandas/tests/io/pytables/test_store.py

+11
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@
5454

5555
from pandas.io import pytables as pytables # noqa: E402 isort:skip
5656
from pandas.io.pytables import TableIterator # noqa: E402 isort:skip
57+
import platform
58+
import re
59+
import sys
60+
is_crashing_arch=bool((platform.uname()[4].startswith('arm') or platform.uname()[4].startswith('aarch')) and sys.maxsize<2**33) # meant for armhf, though this form will also skip on armel - uname = kernel arch
5761

5862

5963
_default_compressor = "blosc"
@@ -1088,6 +1092,7 @@ def check(format, index):
10881092
check("table", index)
10891093
check("fixed", index)
10901094

1095+
@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
10911096
@pytest.mark.skipif(
10921097
not is_platform_little_endian(), reason="reason platform is not little endian"
10931098
)
@@ -1120,6 +1125,7 @@ def test_encoding(self, setup_path):
11201125
],
11211126
)
11221127
@pytest.mark.parametrize("dtype", ["category", object])
1128+
@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
11231129
def test_latin_encoding(self, setup_path, dtype, val):
11241130
enc = "latin-1"
11251131
nan_rep = ""
@@ -1305,6 +1311,7 @@ def test_read_missing_key_opened_store(self, setup_path):
13051311
# still read from it.
13061312
pd.read_hdf(store, "k1")
13071313

1314+
@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
13081315
def test_append_frame_column_oriented(self, setup_path):
13091316
with ensure_clean_store(setup_path) as store:
13101317

@@ -3921,6 +3928,7 @@ def test_start_stop_fixed(self, setup_path):
39213928
df.iloc[3:5, 1:3] = np.nan
39223929
df.iloc[8:10, -2] = np.nan
39233930

3931+
@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
39243932
def test_select_filter_corner(self, setup_path):
39253933

39263934
df = DataFrame(np.random.randn(50, 100))
@@ -4177,6 +4185,7 @@ def test_pytables_native2_read(self, datapath, setup_path):
41774185
assert isinstance(d1, DataFrame)
41784186

41794187
@td.xfail_non_writeable
4188+
@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
41804189
def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path):
41814190
# GH 24510
41824191
# legacy table with fixed format written in Python 2
@@ -4191,6 +4200,7 @@ def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path):
41914200
)
41924201
tm.assert_frame_equal(expected, result)
41934202

4203+
@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
41944204
def test_legacy_table_fixed_format_read_datetime_py2(self, datapath, setup_path):
41954205
# GH 31750
41964206
# legacy table with fixed format and datetime64 column written in Python 2
@@ -4870,6 +4880,7 @@ def test_fspath(self):
48704880
with pd.HDFStore(path) as store:
48714881
assert os.fspath(store) == str(path)
48724882

4883+
@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
48734884
def test_read_py2_hdf_file_in_py3(self, datapath):
48744885
# GH 16781
48754886

pandas/tests/io/pytables/test_timezones.py

+3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
import pandas.util._test_decorators as td
7+
from pandas.compat import is_platform_little_endian
78

89
import pandas as pd
910
from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range
@@ -311,6 +312,7 @@ def test_store_timezone(setup_path):
311312
tm.assert_frame_equal(result, df)
312313

313314

315+
@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
314316
def test_legacy_datetimetz_object(datapath, setup_path):
315317
# legacy from < 0.17.0
316318
# 8260
@@ -362,6 +364,7 @@ def test_read_with_where_tz_aware_index(setup_path):
362364
tm.assert_frame_equal(result, expected)
363365

364366

367+
@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
365368
def test_py2_created_with_datetimez(datapath, setup_path):
366369
# The test HDF5 file was created in Python 2, but could not be read in
367370
# Python 3.

pandas/tests/io/test_common.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import pytest
1010

11-
from pandas.compat import is_platform_windows
11+
from pandas.compat import is_platform_windows, is_platform_little_endian
1212
import pandas.util._test_decorators as td
1313

1414
import pandas as pd
@@ -230,11 +230,11 @@ def test_read_expands_user_home_dir(
230230
"pyarrow",
231231
("io", "data", "feather", "feather-0_3_1.feather"),
232232
),
233-
(
233+
pytest.param(
234234
pd.read_hdf,
235235
"tables",
236236
("io", "data", "legacy_hdf", "datetimetz_object.h5"),
237-
),
237+
marks=pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)),
238238
(pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")),
239239
(pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")),
240240
(pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")),

pandas/tests/io/test_stata.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
read_stata,
3030
)
3131

32+
from pandas.compat import is_platform_little_endian
33+
pytestmark = pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of test_stata on non-little endian",strict=False)
3234

3335
@pytest.fixture()
3436
def mixed_frame():
@@ -203,7 +205,7 @@ def test_read_dta2(self):
203205
# parsed_113 = self.read_dta(self.dta2_113)
204206

205207
# Remove resource warnings
206-
w = [x for x in w if x.category is UserWarning]
208+
w = [x for x in w if x.category is UserWarning and not "Non-x86 system detected" in str(x.message)]
207209

208210
# should get warning for each call to read_dta
209211
assert len(w) == 3
@@ -465,7 +467,7 @@ def test_read_write_dta12(self, version):
465467
warnings.simplefilter("always", InvalidColumnName)
466468
original.to_stata(path, None, version=version)
467469
# should get a warning for that format.
468-
assert len(w) == 1
470+
assert len([x for x in w if not "Non-x86 system detected" in str(x.message)]) == 1
469471

470472
written_and_read_again = self.read_dta(path)
471473
tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)
@@ -1788,8 +1790,9 @@ def test_encoding_latin1_118(self):
17881790
the string values returned are correct."""
17891791
with tm.assert_produces_warning(UnicodeWarning) as w:
17901792
encoded = read_stata(self.dta_encoding_118)
1791-
assert len(w) == 151
1792-
assert w[0].message.args[0] == msg
1793+
w2 = [x for x in w if not "Non-x86 system detected" in str(x.message)]
1794+
assert len(w2) == 151
1795+
assert w2[0].message.args[0] == msg
17931796

17941797
expected = pd.DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
17951798
tm.assert_frame_equal(encoded, expected)

0 commit comments

Comments
 (0)