Skip to content

Commit 1417297

Browse files
Materialize RangeIndex when index=True in parquet writer (#7711)
Resolves: #6873. This PR adds support for materializing a `RangeIndex` when `index=True`. No new tests were added because `test_parquet_index` already covers this case; however, that test was incorrect due to a typo that caused both the pandas and cudf dataframes to be written to the same file. The test is fixed in this PR. Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Keith Kraus (@kkraus14) URL: #7711
1 parent df3c0f0 commit 1417297

File tree

4 files changed

+45
-23
lines changed

4 files changed

+45
-23
lines changed

python/cudf/cudf/_lib/parquet.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,9 @@ cpdef write_parquet(
294294
cdef unique_ptr[cudf_io_types.data_sink] _data_sink
295295
cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink)
296296

297-
if index is not False and not isinstance(table._index, cudf.RangeIndex):
297+
if index is True or (
298+
index is None and not isinstance(table._index, cudf.RangeIndex)
299+
):
298300
tv = table.view()
299301
tbl_meta = make_unique[table_input_metadata](tv)
300302
for level, idx_name in enumerate(table._index.names):

python/cudf/cudf/_lib/utils.pyx

+24-8
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,31 @@ cpdef generate_pandas_metadata(Table table, index):
9999
idx = table.index
100100

101101
if isinstance(idx, cudf.core.index.RangeIndex):
102-
descr = {
103-
"kind": "range",
104-
"name": table.index.name,
105-
"start": table.index.start,
106-
"stop": table.index.stop,
107-
"step": table.index.step,
108-
}
102+
if index is None:
103+
descr = {
104+
"kind": "range",
105+
"name": table.index.name,
106+
"start": table.index.start,
107+
"stop": table.index.stop,
108+
"step": table.index.step,
109+
}
110+
else:
111+
# When `index=True`, RangeIndex needs to be materialized.
112+
materialized_idx = cudf.Index(idx._values, name=idx.name)
113+
descr = \
114+
_index_level_name(
115+
index_name=materialized_idx.name,
116+
level=level,
117+
column_names=col_names
118+
)
119+
index_levels.append(materialized_idx)
109120
else:
110-
descr = _index_level_name(idx.name, level, col_names)
121+
descr = \
122+
_index_level_name(
123+
index_name=idx.name,
124+
level=level,
125+
column_names=col_names
126+
)
111127
if is_categorical_dtype(idx):
112128
raise ValueError(
113129
"'category' column dtypes are currently not "

python/cudf/cudf/tests/test_parquet.py

+13-12
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
1+
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
2+
23
import datetime
34
import math
45
import os
@@ -1718,24 +1719,24 @@ def test_parquet_nullable_boolean(tmpdir, engine):
17181719
],
17191720
)
17201721
@pytest.mark.parametrize("index", [None, True, False])
1721-
def test_parquet_index(tmpdir, pdf, index):
1722-
pandas_path = tmpdir.join("pandas_index.parquet")
1723-
cudf_path = tmpdir.join("pandas_index.parquet")
1722+
def test_parquet_index(pdf, index):
1723+
pandas_buffer = BytesIO()
1724+
cudf_buffer = BytesIO()
17241725

17251726
gdf = cudf.from_pandas(pdf)
17261727

1727-
pdf.to_parquet(pandas_path, index=index)
1728-
gdf.to_parquet(cudf_path, index=index)
1728+
pdf.to_parquet(pandas_buffer, index=index)
1729+
gdf.to_parquet(cudf_buffer, index=index)
17291730

1730-
expected = pd.read_parquet(cudf_path)
1731-
actual = cudf.read_parquet(cudf_path)
1731+
expected = pd.read_parquet(cudf_buffer)
1732+
actual = cudf.read_parquet(pandas_buffer)
17321733

1733-
assert_eq(expected, actual)
1734+
assert_eq(expected, actual, check_index_type=True)
17341735

1735-
expected = pd.read_parquet(pandas_path)
1736-
actual = cudf.read_parquet(pandas_path)
1736+
expected = pd.read_parquet(pandas_buffer)
1737+
actual = cudf.read_parquet(cudf_buffer)
17371738

1738-
assert_eq(expected, actual)
1739+
assert_eq(expected, actual, check_index_type=True)
17391740

17401741

17411742
@pytest.mark.parametrize("engine", ["cudf", "pyarrow"])

python/cudf/cudf/utils/ioutils.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
1+
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
22

33
import datetime
44
import os
@@ -193,7 +193,10 @@
193193
index : bool, default None
194194
If ``True``, include the dataframe's index(es) in the file output. If
195195
``False``, they will not be written to the file. If ``None``, the
196-
engine's default behavior will be used.
196+
engine's default behavior will be used: a ``RangeIndex`` is not saved
197+
as values but is stored as a range in the metadata instead,
198+
so it doesn't require much space and is faster. Other indexes will
199+
be included as columns in the file output.
197200
partition_cols : list, optional, default None
198201
Column names by which to partition the dataset
199202
Columns are partitioned in the order they are given

0 commit comments

Comments
 (0)