Skip to content

Commit a92ad86

Browse files
Fix parquet partitioning pytest failures (#13474)
This PR fixes parquet pytest failures, mostly working around two upstream issues: 1. pandas-dev/pandas#53345 2. apache/arrow#33321 This fixes 20 pytest failures: This PR: ``` = 231 failed, 95767 passed, 2045 skipped, 764 xfailed, 300 xpassed in 426.65s (0:07:06) = ``` On `pandas_2.0_feature_branch`: ``` = 251 failed, 95747 passed, 2045 skipped, 764 xfailed, 300 xpassed in 433.50s (0:07:13) = ```
1 parent 258bf3d commit a92ad86

File tree

1 file changed

+48
-2
lines changed

1 file changed

+48
-2
lines changed

python/cudf/cudf/tests/test_parquet.py

+48-2
Original file line numberDiff line numberDiff line change
@@ -1749,6 +1749,15 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename):
17491749

17501750
# Check that cudf and pd return the same read
17511751
got_cudf = cudf.read_parquet(gdf_dir)
1752+
if PANDAS_GE_200 and isinstance(got_pd["c"].dtype, pd.CategoricalDtype):
1753+
# Work-around for pandas bug:
1754+
# https://github.com/pandas-dev/pandas/issues/53345
1755+
got_pd["c"] = got_pd["c"].astype(
1756+
pd.CategoricalDtype(
1757+
categories=got_pd["c"].dtype.categories.astype("int64"),
1758+
ordered=got_pd["c"].dtype.ordered,
1759+
)
1760+
)
17521761
assert_eq(got_pd, got_cudf)
17531762

17541763
# If filename is specified, check that it is correct
@@ -1796,6 +1805,15 @@ def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
17961805

17971806
# Check that cudf and pd return the same read
17981807
got_cudf = cudf.read_parquet(gdf_dir)
1808+
if PANDAS_GE_200:
1809+
# Work-around for pandas bug:
1810+
# https://github.com/pandas-dev/pandas/issues/53345
1811+
got_pd["a"] = got_pd["a"].astype(
1812+
pd.CategoricalDtype(
1813+
categories=got_pd["a"].dtype.categories.astype("int64"),
1814+
ordered=got_pd["a"].dtype.ordered,
1815+
)
1816+
)
17991817
assert_eq(got_pd, got_cudf)
18001818

18011819

@@ -1836,7 +1854,15 @@ def test_parquet_writer_chunked_max_file_size(
18361854

18371855
# Check that cudf and pd return the same read
18381856
got_cudf = cudf.read_parquet(gdf_dir)
1839-
1857+
if PANDAS_GE_200:
1858+
# Work-around for pandas bug:
1859+
# https://github.com/pandas-dev/pandas/issues/53345
1860+
got_pd["a"] = got_pd["a"].astype(
1861+
pd.CategoricalDtype(
1862+
categories=got_pd["a"].dtype.categories.astype("int64"),
1863+
ordered=got_pd["a"].dtype.ordered,
1864+
)
1865+
)
18401866
assert_eq(
18411867
got_pd.sort_values(["b"]).reset_index(drop=True),
18421868
got_cudf.sort_values(["b"]).reset_index(drop=True),
@@ -1882,6 +1908,15 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory):
18821908

18831909
# Check that cudf and pd return the same read
18841910
got_cudf = cudf.read_parquet(gdf_dir)
1911+
if PANDAS_GE_200:
1912+
# Work-around for pandas bug:
1913+
# https://github.com/pandas-dev/pandas/issues/53345
1914+
got_pd["a"] = got_pd["a"].astype(
1915+
pd.CategoricalDtype(
1916+
categories=got_pd["a"].dtype.categories.astype("int64"),
1917+
ordered=got_pd["a"].dtype.ordered,
1918+
)
1919+
)
18851920
assert_eq(got_pd, got_cudf)
18861921

18871922

@@ -1989,6 +2024,15 @@ def test_read_parquet_partitioned_filtered(
19892024
filters = [[("a", "==", 10)], [("c", "==", 1)]]
19902025
got = cudf.read_parquet(read_path, filters=filters)
19912026
expect = pd.read_parquet(read_path, filters=filters)
2027+
if PANDAS_GE_200:
2028+
# Work-around for pandas bug:
2029+
# https://github.com/pandas-dev/pandas/issues/53345
2030+
expect["c"] = expect["c"].astype(
2031+
pd.CategoricalDtype(
2032+
categories=expect["c"].dtype.categories.astype("int64"),
2033+
ordered=expect["c"].dtype.ordered,
2034+
)
2035+
)
19922036
assert_eq(expect, got)
19932037

19942038

@@ -2803,7 +2847,9 @@ def test_parquet_roundtrip_time_delta():
28032847
)
28042848
buffer = BytesIO()
28052849
df.to_parquet(buffer)
2806-
assert_eq(df, cudf.read_parquet(buffer))
2850+
# TODO: Remove `check_dtype` once following issue is fixed in arrow:
2851+
# https://github.com/apache/arrow/issues/33321
2852+
assert_eq(df, cudf.read_parquet(buffer), check_dtype=not PANDAS_GE_200)
28072853

28082854

28092855
def test_parquet_reader_malformed_file(datadir):

0 commit comments

Comments
 (0)