Fix parquet partitioning pytest failures (#13474)

galipremsagar · web-flow · commit a92ad860d15c · 2023-05-31T08:56:23.000-05:00
This PR fixes parquet pytest failures, mostly by working around two upstream issues: 1. pandas-dev/pandas#53345 2. apache/arrow#33321 This fixes 20 pytest failures. This PR: ``` = 231 failed, 95767 passed, 2045 skipped, 764 xfailed, 300 xpassed in 426.65s (0:07:06) = ``` On `pandas_2.0_feature_branch`: ``` = 251 failed, 95747 passed, 2045 skipped, 764 xfailed, 300 xpassed in 433.50s (0:07:13) = ```
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -1749,6 +1749,15 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename):
 
     # Check that cudf and pd return the same read
     got_cudf = cudf.read_parquet(gdf_dir)
+    if PANDAS_GE_200 and isinstance(got_pd["c"].dtype, pd.CategoricalDtype):
+        # Work-around for pandas bug:
+        # https://github.com/pandas-dev/pandas/issues/53345
+        got_pd["c"] = got_pd["c"].astype(
+            pd.CategoricalDtype(
+                categories=got_pd["c"].dtype.categories.astype("int64"),
+                ordered=got_pd["c"].dtype.ordered,
+            )
+        )
     assert_eq(got_pd, got_cudf)
 
     # If filename is specified, check that it is correct
@@ -1796,6 +1805,15 @@ def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
 
     # Check that cudf and pd return the same read
     got_cudf = cudf.read_parquet(gdf_dir)
+    if PANDAS_GE_200:
+        # Work-around for pandas bug:
+        # https://github.com/pandas-dev/pandas/issues/53345
+        got_pd["a"] = got_pd["a"].astype(
+            pd.CategoricalDtype(
+                categories=got_pd["a"].dtype.categories.astype("int64"),
+                ordered=got_pd["a"].dtype.ordered,
+            )
+        )
     assert_eq(got_pd, got_cudf)
 
 
@@ -1836,7 +1854,15 @@ def test_parquet_writer_chunked_max_file_size(
 
     # Check that cudf and pd return the same read
     got_cudf = cudf.read_parquet(gdf_dir)
-
+    if PANDAS_GE_200:
+        # Work-around for pandas bug:
+        # https://github.com/pandas-dev/pandas/issues/53345
+        got_pd["a"] = got_pd["a"].astype(
+            pd.CategoricalDtype(
+                categories=got_pd["a"].dtype.categories.astype("int64"),
+                ordered=got_pd["a"].dtype.ordered,
+            )
+        )
     assert_eq(
         got_pd.sort_values(["b"]).reset_index(drop=True),
         got_cudf.sort_values(["b"]).reset_index(drop=True),
@@ -1882,6 +1908,15 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory):
 
     # Check that cudf and pd return the same read
     got_cudf = cudf.read_parquet(gdf_dir)
+    if PANDAS_GE_200:
+        # Work-around for pandas bug:
+        # https://github.com/pandas-dev/pandas/issues/53345
+        got_pd["a"] = got_pd["a"].astype(
+            pd.CategoricalDtype(
+                categories=got_pd["a"].dtype.categories.astype("int64"),
+                ordered=got_pd["a"].dtype.ordered,
+            )
+        )
     assert_eq(got_pd, got_cudf)
 
 
@@ -1989,6 +2024,15 @@ def test_read_parquet_partitioned_filtered(
     filters = [[("a", "==", 10)], [("c", "==", 1)]]
     got = cudf.read_parquet(read_path, filters=filters)
     expect = pd.read_parquet(read_path, filters=filters)
+    if PANDAS_GE_200:
+        # Work-around for pandas bug:
+        # https://github.com/pandas-dev/pandas/issues/53345
+        expect["c"] = expect["c"].astype(
+            pd.CategoricalDtype(
+                categories=expect["c"].dtype.categories.astype("int64"),
+                ordered=expect["c"].dtype.ordered,
+            )
+        )
     assert_eq(expect, got)
 
 
@@ -2803,7 +2847,9 @@ def test_parquet_roundtrip_time_delta():
     )
     buffer = BytesIO()
     df.to_parquet(buffer)
-    assert_eq(df, cudf.read_parquet(buffer))
+    # TODO: Remove `check_dtype` once following issue is fixed in arrow:
+    # https://github.com/apache/arrow/issues/33321
+    assert_eq(df, cudf.read_parquet(buffer), check_dtype=not PANDAS_GE_200)
 
 
 def test_parquet_reader_malformed_file(datadir):