
Commit d235f69

jorisvandenbossche authored and xhochy committed
ARROW-5436: [Python] parquet.read_table add filters keyword
https://issues.apache.org/jira/browse/ARROW-5436

I suppose the fact that `parquet.read_table` dispatched to FileSystem.read_parquet was for historical reasons (that function was added before ParquetDataset existed), but calling ParquetDataset directly looks cleaner than going through FileSystem.read_parquet, so I changed that as well. In addition, I made sure the `memory_map` keyword is actually passed through, which I think was an oversight in #2954. (Those two changes should be useful anyway, regardless of whether the `filters` keyword is added.)

Author: Joris Van den Bossche <[email protected]>

Closes #4409 from jorisvandenbossche/ARROW-5436-parquet-read_table and squashes the following commits:

85e5b0e <Joris Van den Bossche> lint
0ae1488 <Joris Van den Bossche> add test with nested list
9baf420 <Joris Van den Bossche> add filters to read_pandas
0df8c88 <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-5436-parquet-read_table
4ea7b77 <Joris Van den Bossche> fix test
4eb2ea7 <Joris Van den Bossche> add filters keyword
9c10f70 <Joris Van den Bossche> fix passing of memory_map (leftover from ARROW-2807)
896abb2 <Joris Van den Bossche> simplify read_table (use ParquetDataset directly)
1 parent 052130a commit d235f69
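
In practice, the change means the `filters` keyword, previously only available when constructing a ParquetDataset explicitly, can now be passed straight to `read_table` (and `read_pandas`) when `source` is a partitioned dataset directory. A minimal sketch of the call pattern; the dataset path, partition column, and selected columns below are placeholders:

    import pyarrow.parquet as pq

    # Before this commit: partition-level filtering required building the
    # dataset object by hand.
    dataset = pq.ParquetDataset('/data/events', filters=[('year', '=', 2019)])
    table = dataset.read(columns=['user_id', 'value'])

    # With this commit: the same filter can be passed directly to read_table.
    # Filtering is partition-level (hive) only, i.e. it prunes whole
    # directories of the dataset rather than individual rows.
    table = pq.read_table('/data/events',
                          columns=['user_id', 'value'],
                          filters=[('year', '=', 2019)])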

File tree

2 files changed: +44 -8 lines

python/pyarrow/parquet.py
python/pyarrow/tests/test_parquet.py

python/pyarrow/parquet.py

+12 -8

@@ -1181,6 +1181,11 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1,
     If the source is a file path, use a memory map to read file, which can
     improve performance in some environments
 {1}
+filters : List[Tuple] or List[List[Tuple]] or None (default)
+    List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This
+    implements partition-level (hive) filtering only, i.e., to prevent the
+    loading of some files of the dataset if `source` is a directory.
+    See the docstring of ParquetDataset for more details.
 
 Returns
 -------
@@ -1190,14 +1195,12 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1,
 
 def read_table(source, columns=None, use_threads=True, metadata=None,
                use_pandas_metadata=False, memory_map=True,
-               filesystem=None):
+               filesystem=None, filters=None):
     if _is_path_like(source):
-        fs, path = _get_filesystem_and_path(filesystem, source)
-        return fs.read_parquet(path, columns=columns,
-                               use_threads=use_threads, metadata=metadata,
-                               use_pandas_metadata=use_pandas_metadata)
-
-    pf = ParquetFile(source, metadata=metadata)
+        pf = ParquetDataset(source, metadata=metadata, memory_map=memory_map,
+                            filesystem=filesystem, filters=filters)
+    else:
+        pf = ParquetFile(source, metadata=metadata, memory_map=memory_map)
     return pf.read(columns=columns, use_threads=use_threads,
                    use_pandas_metadata=use_pandas_metadata)
 
@@ -1212,10 +1215,11 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
 
 
 def read_pandas(source, columns=None, use_threads=True, memory_map=True,
-                metadata=None):
+                metadata=None, filters=None):
     return read_table(source, columns=columns,
                       use_threads=use_threads,
                       metadata=metadata, memory_map=True,
+                      filters=filters,
                       use_pandas_metadata=True)
 
 
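For reference, the `filters` argument follows the same convention as ParquetDataset: each predicate is a `(column, op, value)` tuple, an inner list of tuples is combined with AND, and the outer list combines those groups with OR; a flat list of tuples is treated as a single AND group (as the new test below exercises). A sketch with made-up partition columns and dataset path:

    import pyarrow.parquet as pq

    # Flat list of tuples: a single conjunction (AND).
    # Keep only partitions where year == 2019 and month < 7.
    filters = [('year', '=', 2019), ('month', '<', 7)]

    # Nested lists: OR of AND groups.
    # Keep partitions from December 2018 or from any month of 2019.
    filters = [
        [('year', '=', 2018), ('month', '=', 12)],
        [('year', '=', 2019)],
    ]

    table = pq.read_table('/data/events', filters=filters)
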
python/pyarrow/tests/test_parquet.py

+32

@@ -1523,6 +1523,38 @@ def test_invalid_pred_op(tempdir):
     ])
 
 
+@pytest.mark.pandas
+def test_filters_read_table(tempdir):
+    # test that filters keyword is passed through in read_table
+    fs = LocalFileSystem.get_instance()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [
+        ['integers', integer_keys],
+    ]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+    }, columns=['index', 'integers'])
+
+    _generate_partition_directories(fs, base_path, partition_spec, df)
+
+    table = pq.read_table(
+        base_path, filesystem=fs, filters=[('integers', '<', 3)])
+    assert table.num_rows == 3
+
+    table = pq.read_table(
+        base_path, filesystem=fs, filters=[[('integers', '<', 3)]])
+    assert table.num_rows == 3
+
+    table = pq.read_pandas(
+        base_path, filters=[('integers', '<', 3)])
+    assert table.num_rows == 3
+
+
 @pytest.yield_fixture
 def s3_example():
     access_key = os.environ['PYARROW_TEST_S3_ACCESS_KEY']