@@ -1749,6 +1749,15 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename):
1749
1749
1750
1750
# Check that cudf and pd return the same read
1751
1751
got_cudf = cudf .read_parquet (gdf_dir )
1752
+ if PANDAS_GE_200 and isinstance (got_pd ["c" ].dtype , pd .CategoricalDtype ):
1753
+ # Work-around for pandas bug:
1754
+ # https://github.com/pandas-dev/pandas/issues/53345
1755
+ got_pd ["c" ] = got_pd ["c" ].astype (
1756
+ pd .CategoricalDtype (
1757
+ categories = got_pd ["c" ].dtype .categories .astype ("int64" ),
1758
+ ordered = got_pd ["c" ].dtype .ordered ,
1759
+ )
1760
+ )
1752
1761
assert_eq (got_pd , got_cudf )
1753
1762
1754
1763
# If filename is specified, check that it is correct
@@ -1796,6 +1805,15 @@ def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
1796
1805
1797
1806
# Check that cudf and pd return the same read
1798
1807
got_cudf = cudf .read_parquet (gdf_dir )
1808
+ if PANDAS_GE_200 :
1809
+ # Work-around for pandas bug:
1810
+ # https://github.com/pandas-dev/pandas/issues/53345
1811
+ got_pd ["a" ] = got_pd ["a" ].astype (
1812
+ pd .CategoricalDtype (
1813
+ categories = got_pd ["a" ].dtype .categories .astype ("int64" ),
1814
+ ordered = got_pd ["a" ].dtype .ordered ,
1815
+ )
1816
+ )
1799
1817
assert_eq (got_pd , got_cudf )
1800
1818
1801
1819
@@ -1836,7 +1854,15 @@ def test_parquet_writer_chunked_max_file_size(
1836
1854
1837
1855
# Check that cudf and pd return the same read
1838
1856
got_cudf = cudf .read_parquet (gdf_dir )
1839
-
1857
+ if PANDAS_GE_200 :
1858
+ # Work-around for pandas bug:
1859
+ # https://github.com/pandas-dev/pandas/issues/53345
1860
+ got_pd ["a" ] = got_pd ["a" ].astype (
1861
+ pd .CategoricalDtype (
1862
+ categories = got_pd ["a" ].dtype .categories .astype ("int64" ),
1863
+ ordered = got_pd ["a" ].dtype .ordered ,
1864
+ )
1865
+ )
1840
1866
assert_eq (
1841
1867
got_pd .sort_values (["b" ]).reset_index (drop = True ),
1842
1868
got_cudf .sort_values (["b" ]).reset_index (drop = True ),
@@ -1882,6 +1908,15 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory):
1882
1908
1883
1909
# Check that cudf and pd return the same read
1884
1910
got_cudf = cudf .read_parquet (gdf_dir )
1911
+ if PANDAS_GE_200 :
1912
+ # Work-around for pandas bug:
1913
+ # https://github.com/pandas-dev/pandas/issues/53345
1914
+ got_pd ["a" ] = got_pd ["a" ].astype (
1915
+ pd .CategoricalDtype (
1916
+ categories = got_pd ["a" ].dtype .categories .astype ("int64" ),
1917
+ ordered = got_pd ["a" ].dtype .ordered ,
1918
+ )
1919
+ )
1885
1920
assert_eq (got_pd , got_cudf )
1886
1921
1887
1922
@@ -1989,6 +2024,15 @@ def test_read_parquet_partitioned_filtered(
1989
2024
filters = [[("a" , "==" , 10 )], [("c" , "==" , 1 )]]
1990
2025
got = cudf .read_parquet (read_path , filters = filters )
1991
2026
expect = pd .read_parquet (read_path , filters = filters )
2027
+ if PANDAS_GE_200 :
2028
+ # Work-around for pandas bug:
2029
+ # https://github.com/pandas-dev/pandas/issues/53345
2030
+ expect ["c" ] = expect ["c" ].astype (
2031
+ pd .CategoricalDtype (
2032
+ categories = expect ["c" ].dtype .categories .astype ("int64" ),
2033
+ ordered = expect ["c" ].dtype .ordered ,
2034
+ )
2035
+ )
1992
2036
assert_eq (expect , got )
1993
2037
1994
2038
@@ -2803,7 +2847,9 @@ def test_parquet_roundtrip_time_delta():
2803
2847
)
2804
2848
buffer = BytesIO ()
2805
2849
df .to_parquet (buffer )
2806
- assert_eq (df , cudf .read_parquet (buffer ))
2850
+ # TODO: Remove `check_dtype` once the following issue is fixed in arrow:
2851
+ # https://github.com/apache/arrow/issues/33321
2852
+ assert_eq (df , cudf .read_parquet (buffer ), check_dtype = not PANDAS_GE_200 )
2807
2853
2808
2854
2809
2855
def test_parquet_reader_malformed_file (datadir ):
0 commit comments