Skip to content

Commit f76441c

Browse files
rhshadrach authored and feefladder committed
TST: Replace deprecated pyarrow.parquet.ParquetDataset (pandas-dev#42873)
1 parent 8c3349e commit f76441c

File tree

2 files changed

+28
-10
lines changed

2 files changed

+28
-10
lines changed

pandas/compat/pyarrow.py

+2
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,10 @@
1111
pa_version_under2p0 = _palv < Version("2.0.0")
1212
pa_version_under3p0 = _palv < Version("3.0.0")
1313
pa_version_under4p0 = _palv < Version("4.0.0")
14+
pa_version_under5p0 = _palv < Version("5.0.0")
1415
except ImportError:
1516
pa_version_under1p0 = True
1617
pa_version_under2p0 = True
1718
pa_version_under3p0 = True
1819
pa_version_under4p0 = True
20+
pa_version_under5p0 = True

pandas/tests/io/test_parquet.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.compat.pyarrow import (
1818
pa_version_under1p0,
1919
pa_version_under2p0,
20+
pa_version_under5p0,
2021
)
2122
import pandas.util._test_decorators as td
2223

@@ -222,6 +223,29 @@ def compare(repeat):
222223
compare(repeat)
223224

224225

226+
def check_partition_names(path, expected):
    """Check partitions of a parquet file are as expected.

    Parameters
    ----------
    path : str
        Path of the dataset.
    expected : iterable of str
        Expected partition names.
    """
    # Materialize once so any iterable works: the legacy branch calls
    # len() on it, and the dataset branch compares against a list
    # (``schema.names`` is a list, so ``== expected`` would always be
    # False for a tuple/set/generator).
    expected = list(expected)
    if pa_version_under5p0:
        # ParquetDataset.partitions is deprecated in pyarrow >= 5.0;
        # keep the legacy check for older versions only.
        import pyarrow.parquet as pq

        dataset = pq.ParquetDataset(path, validate_schema=False)
        assert len(dataset.partitions.partition_names) == len(expected)
        assert dataset.partitions.partition_names == set(expected)
    else:
        import pyarrow.dataset as ds

        dataset = ds.dataset(path, partitioning="hive")
        # Order-sensitive comparison: hive partitioning preserves the
        # order the partition columns were written in.
        assert dataset.partitioning.schema.names == expected
247+
248+
225249
def test_invalid_engine(df_compat):
226250
msg = "engine must be one of 'pyarrow', 'fastparquet'"
227251
with pytest.raises(ValueError, match=msg):
@@ -743,11 +767,7 @@ def test_partition_cols_supported(self, pa, df_full):
743767
df = df_full
744768
with tm.ensure_clean_dir() as path:
745769
df.to_parquet(path, partition_cols=partition_cols, compression=None)
746-
import pyarrow.parquet as pq
747-
748-
dataset = pq.ParquetDataset(path, validate_schema=False)
749-
assert len(dataset.partitions.partition_names) == 2
750-
assert dataset.partitions.partition_names == set(partition_cols)
770+
check_partition_names(path, partition_cols)
751771
assert read_parquet(path).shape == df.shape
752772

753773
def test_partition_cols_string(self, pa, df_full):
@@ -757,11 +777,7 @@ def test_partition_cols_string(self, pa, df_full):
757777
df = df_full
758778
with tm.ensure_clean_dir() as path:
759779
df.to_parquet(path, partition_cols=partition_cols, compression=None)
760-
import pyarrow.parquet as pq
761-
762-
dataset = pq.ParquetDataset(path, validate_schema=False)
763-
assert len(dataset.partitions.partition_names) == 1
764-
assert dataset.partitions.partition_names == set(partition_cols_list)
780+
check_partition_names(path, partition_cols_list)
765781
assert read_parquet(path).shape == df.shape
766782

767783
@pytest.mark.parametrize("path_type", [str, pathlib.Path])

0 commit comments

Comments
 (0)