Skip to content

Commit d9b7a98

Browse files
authored
Avoid converting Decimal32/Decimal64 in to_arrow and from_arrow APIs (#17422)
Now that the Arrow format includes `Decimal32` and `Decimal64` data types, cuDF no longer needs to convert them to decimal128 when importing/exporting values via the `to_arrow` and `from_arrow` APIs. Instead, we can just treat them like any other fixed-width data type and use the buffers directly. This doesn't fully address #17080, as it doesn't make any changes to the Parquet side of things. This also incorporates the changes from #17405, which are needed for debug tests. That should get merged first, and then I can rebase this. Authors: - Matt Topol (https://github.com/zeroshade) - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) - Robert (Bobby) Evans (https://github.com/revans2) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: #17422
1 parent aa80d45 commit d9b7a98

23 files changed

+735
-417
lines changed

conda/recipes/cudf/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ requirements:
8181
- numba-cuda >=0.2.0,<0.3.0a0
8282
- numba >=0.59.1,<0.61.0a0
8383
- numpy >=1.23,<3.0a0
84-
- pyarrow>=14.0.0,<18.0.0a0
84+
- pyarrow>=14.0.0,<20.0.0a0
8585
- libcudf ={{ version }}
8686
- pylibcudf ={{ version }}
8787
- {{ pin_compatible('rmm', max_pin='x.x') }}

conda/recipes/pylibcudf/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ requirements:
7777
- typing_extensions >=4.0.0
7878
- pandas >=2.0,<2.2.4dev0
7979
- numpy >=1.23,<3.0a0
80-
- pyarrow>=14.0.0,<18.0.0a0
80+
- pyarrow>=14.0.0,<20.0.0a0
8181
- {{ pin_compatible('rmm', max_pin='x.x') }}
8282
- fsspec >=0.6.0
8383
{% if cuda_major == "11" %}

cpp/cmake/thirdparty/get_arrow.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# =============================================================================
2-
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
55
# in compliance with the License. You may obtain a copy of the License at
@@ -347,7 +347,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
347347
set(CUDF_VERSION_Arrow
348348
# This version must be kept in sync with the libarrow version pinned for builds in
349349
# dependencies.yaml.
350-
16.1.0
350+
19.0.0
351351
CACHE STRING "The version of Arrow to find (or build)"
352352
)
353353
endif()

cpp/cmake/thirdparty/get_nanoarrow.cmake

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# =============================================================================
2-
# Copyright (c) 2024, NVIDIA CORPORATION.
2+
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
55
# in compliance with the License. You may obtain a copy of the License at
@@ -23,11 +23,11 @@ function(find_and_configure_nanoarrow)
2323
# Currently we need to always build nanoarrow so we don't pick up a previously installed version
2424
set(CPM_DOWNLOAD_nanoarrow ON)
2525
rapids_cpm_find(
26-
nanoarrow 0.6.0.dev
26+
nanoarrow 0.7.0.dev
2727
GLOBAL_TARGETS nanoarrow
2828
CPM_ARGS
2929
GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
30-
GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
30+
GIT_TAG 4bf5a9322626e95e3717e43de7616c0a256179eb
3131
GIT_SHALLOW FALSE
3232
OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
3333
)
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
2+
{
3+
"packages" : {
4+
"nanoarrow" : {
5+
"version" : "0.7.0.dev",
6+
"git_url" : "https://github.com/apache/arrow-nanoarrow.git",
7+
"git_tag" : "4bf5a9322626e95e3717e43de7616c0a256179eb",
8+
"git_shallow" : false,
9+
"patches" : [
10+
{
11+
"file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff",
12+
"issue" : "https://github.com/apache/arrow-nanoarrow/issues/537",
13+
"fixed_in" : ""
14+
}
15+
]
16+
}
17+
}
18+
}

cpp/src/interop/arrow_utilities.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -63,6 +63,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
6363
default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
6464
}
6565
}
66+
case NANOARROW_TYPE_DECIMAL32: return data_type{type_id::DECIMAL32, -arrow_view->decimal_scale};
67+
case NANOARROW_TYPE_DECIMAL64: return data_type{type_id::DECIMAL64, -arrow_view->decimal_scale};
6668
case NANOARROW_TYPE_DECIMAL128:
6769
return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
6870
default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
@@ -84,6 +86,8 @@ ArrowType id_to_arrow_type(cudf::type_id id)
8486
case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
8587
case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
8688
case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
89+
case cudf::type_id::DECIMAL32: return NANOARROW_TYPE_DECIMAL32;
90+
case cudf::type_id::DECIMAL64: return NANOARROW_TYPE_DECIMAL64;
8791
case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128;
8892
default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
8993
}

cpp/src/interop/arrow_utilities.hpp

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -70,21 +70,5 @@ ArrowType id_to_arrow_storage_type(cudf::type_id id);
7070
*/
7171
int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column);
7272

73-
/**
74-
* @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility
75-
*
76-
* The template parameter should be the underlying type of the data (e.g. int32_t for
77-
* 32-bit decimal and int64_t for 64-bit decimal).
78-
*
79-
* @param input column_view of the data
80-
* @param stream cuda stream to perform the operations on
81-
* @param mr memory resource to allocate the returned device_uvector with
82-
* @return unique_ptr to a device_buffer containing the upcasted data
83-
*/
84-
template <typename DeviceType>
85-
std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
86-
rmm::cuda_stream_view stream,
87-
rmm::device_async_resource_ref mr);
88-
8973
} // namespace detail
9074
} // namespace cudf

cpp/src/interop/from_arrow_device.cu

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -49,9 +49,7 @@ namespace {
4949
using dispatch_tuple_t = std::tuple<column_view, owned_columns_t>;
5050

5151
struct dispatch_from_arrow_device {
52-
template <typename T,
53-
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
54-
!std::is_same_v<T, numeric::decimal128>)>
52+
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() && !is_fixed_point<T>())>
5553
dispatch_tuple_t operator()(ArrowSchemaView*,
5654
ArrowArray const*,
5755
data_type,
@@ -62,8 +60,7 @@ struct dispatch_from_arrow_device {
6260
CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
6361
}
6462

65-
template <typename T,
66-
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
63+
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
6764
dispatch_tuple_t operator()(ArrowSchemaView* schema,
6865
ArrowArray const* input,
6966
data_type type,

cpp/src/interop/from_arrow_host.cu

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -69,22 +69,19 @@ struct dispatch_copy_from_arrow_host {
6969
return mask;
7070
}
7171

72-
template <typename T,
73-
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
74-
!std::is_same_v<T, numeric::decimal128>)>
72+
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() && !is_fixed_point<T>())>
7573
std::unique_ptr<column> operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool)
7674
{
7775
CUDF_FAIL("Unsupported type in copy_from_arrow_host.");
7876
}
7977

80-
template <typename T,
81-
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
78+
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
8279
std::unique_ptr<column> operator()(ArrowSchemaView* schema,
8380
ArrowArray const* input,
8481
data_type type,
8582
bool skip_mask)
8683
{
87-
using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
84+
using DeviceType = device_storage_type_t<T>;
8885

8986
size_type const num_rows = input->length;
9087
size_type const offset = input->offset;

cpp/src/interop/to_arrow_device.cu

Lines changed: 8 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -92,13 +92,15 @@ int set_buffer(std::unique_ptr<T> device_buf, int64_t i, ArrowArray* out)
9292
}
9393

9494
struct dispatch_to_arrow_device {
95-
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
95+
template <typename T,
96+
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() and not is_fixed_point<T>())>
9697
int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*)
9798
{
9899
CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
99100
}
100101

101-
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
102+
// cover rep layout compatible and decimal types
103+
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() or is_fixed_point<T>())>
102104
int operator()(cudf::column&& column,
103105
rmm::cuda_stream_view stream,
104106
rmm::device_async_resource_ref mr,
@@ -132,64 +134,6 @@ struct dispatch_to_arrow_device {
132134
}
133135
};
134136

135-
template <typename DeviceType>
136-
int construct_decimals(cudf::column_view input,
137-
rmm::cuda_stream_view stream,
138-
rmm::device_async_resource_ref mr,
139-
ArrowArray* out)
140-
{
141-
nanoarrow::UniqueArray tmp;
142-
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));
143-
144-
auto buf = detail::convert_decimals_to_decimal128<DeviceType>(input, stream, mr);
145-
// Synchronize stream here to ensure the decimal128 buffer is ready.
146-
stream.synchronize();
147-
NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));
148-
149-
ArrowArrayMove(tmp.get(), out);
150-
return NANOARROW_OK;
151-
}
152-
153-
template <>
154-
int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& column,
155-
rmm::cuda_stream_view stream,
156-
rmm::device_async_resource_ref mr,
157-
ArrowArray* out)
158-
{
159-
using DeviceType = int32_t;
160-
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
161-
auto contents = column.release();
162-
NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
163-
return NANOARROW_OK;
164-
}
165-
166-
template <>
167-
int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& column,
168-
rmm::cuda_stream_view stream,
169-
rmm::device_async_resource_ref mr,
170-
ArrowArray* out)
171-
{
172-
using DeviceType = int64_t;
173-
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
174-
auto contents = column.release();
175-
NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
176-
return NANOARROW_OK;
177-
}
178-
179-
template <>
180-
int dispatch_to_arrow_device::operator()<numeric::decimal128>(cudf::column&& column,
181-
rmm::cuda_stream_view stream,
182-
rmm::device_async_resource_ref mr,
183-
ArrowArray* out)
184-
{
185-
nanoarrow::UniqueArray tmp;
186-
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
187-
auto contents = column.release();
188-
NANOARROW_RETURN_NOT_OK(set_contents(contents, tmp.get()));
189-
ArrowArrayMove(tmp.get(), out);
190-
return NANOARROW_OK;
191-
}
192-
193137
template <>
194138
int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
195139
rmm::cuda_stream_view stream,
@@ -350,13 +294,14 @@ struct dispatch_to_arrow_device_view {
350294
rmm::cuda_stream_view stream;
351295
rmm::device_async_resource_ref mr;
352296

353-
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
297+
template <typename T,
298+
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() and not is_fixed_point<T>())>
354299
int operator()(ArrowArray*) const
355300
{
356301
CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
357302
}
358303

359-
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
304+
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() or is_fixed_point<T>())>
360305
int operator()(ArrowArray* out) const
361306
{
362307
nanoarrow::UniqueArray tmp;
@@ -404,37 +349,6 @@ struct dispatch_to_arrow_device_view {
404349
}
405350
};
406351

407-
template <>
408-
int dispatch_to_arrow_device_view::operator()<numeric::decimal32>(ArrowArray* out) const
409-
{
410-
using DeviceType = int32_t;
411-
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
412-
NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
413-
return NANOARROW_OK;
414-
}
415-
416-
template <>
417-
int dispatch_to_arrow_device_view::operator()<numeric::decimal64>(ArrowArray* out) const
418-
{
419-
using DeviceType = int64_t;
420-
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
421-
NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
422-
return NANOARROW_OK;
423-
}
424-
425-
template <>
426-
int dispatch_to_arrow_device_view::operator()<numeric::decimal128>(ArrowArray* out) const
427-
{
428-
nanoarrow::UniqueArray tmp;
429-
430-
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
431-
NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
432-
NANOARROW_RETURN_NOT_OK(set_view_to_buffer(column, tmp.get()));
433-
434-
ArrowArrayMove(tmp.get(), out);
435-
return NANOARROW_OK;
436-
}
437-
438352
template <>
439353
int dispatch_to_arrow_device_view::operator()<bool>(ArrowArray* out) const
440354
{

cpp/src/interop/to_arrow_host.cu

Lines changed: 3 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -115,8 +115,7 @@ struct dispatch_to_arrow_host {
115115
CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error);
116116
}
117117

118-
template <typename T,
119-
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
118+
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
120119
int operator()(ArrowArray* out) const
121120
{
122121
nanoarrow::UniqueArray tmp;
@@ -125,40 +124,14 @@ struct dispatch_to_arrow_host {
125124
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column));
126125

127126
NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
128-
using DataType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
127+
using DataType = device_storage_type_t<T>;
129128
NANOARROW_RETURN_NOT_OK(
130129
populate_data_buffer(device_span<DataType const>(column.data<DataType>(), column.size()),
131130
ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
132131

133132
ArrowArrayMove(tmp.get(), out);
134133
return NANOARROW_OK;
135134
}
136-
137-
// convert decimal types from libcudf to arrow where those types are not directly
138-
// supported by Arrow. These types must be fit into 128 bits, the smallest
139-
// decimal resolution supported by Arrow
140-
template <typename T,
141-
CUDF_ENABLE_IF(!is_rep_layout_compatible<T>() &&
142-
(std::is_same_v<T, numeric::decimal32> ||
143-
std::is_same_v<T, numeric::decimal64>))>
144-
int operator()(ArrowArray* out) const
145-
{
146-
using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal32>, int32_t, int64_t>;
147-
nanoarrow::UniqueArray tmp;
148-
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
149-
150-
NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
151-
auto buf = detail::convert_decimals_to_decimal128<DeviceType>(column, stream, mr);
152-
// No need to synchronize stream here as populate_data_buffer uses the same stream to copy data
153-
// to host.
154-
NANOARROW_RETURN_NOT_OK(
155-
populate_data_buffer(device_span<__int128_t const>(
156-
reinterpret_cast<const __int128_t*>(buf->data()), column.size()),
157-
ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
158-
159-
ArrowArrayMove(tmp.get(), out);
160-
return NANOARROW_OK;
161-
}
162135
};
163136

164137
int get_column(cudf::column_view column,

0 commit comments

Comments
 (0)