Skip to content

Commit f1f1d0f

Browse files
authored
Add column_device_view to orc writer (#7676)
This PR adds column_device_view members to EncChunk, DictionaryChunk and StripeDictionary structures which are used in the ORC writer. The idea is to replace members in these structures which replicate the same information. Usage of nvstrdesc_s has also been eliminated in the ORC writer. Fixes #7347, Addresses #5682, Addresses #7334 Authors: - Kumar Aatish (@kaatish) Approvers: - Vukasin Milovanovic (@vuule) - Devavret Makkar (@devavret) URL: #7676
1 parent b854598 commit f1f1d0f

File tree

10 files changed

+283
-273
lines changed

10 files changed

+283
-273
lines changed

cpp/src/io/orc/dict_enc.cu

+80-71
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "orc_common.h"
1818
#include "orc_gpu.h"
1919

20+
#include <cudf/table/table_device_view.cuh>
2021
#include <io/utilities/block_utils.cuh>
2122

2223
#include <rmm/cuda_stream_view.hpp>
@@ -46,14 +47,16 @@ struct dictinit_state_s {
4647
};
4748

4849
/**
49-
* @brief Return a 12-bit hash from a byte sequence
50+
* @brief Return a 12-bit hash from a string
5051
*/
51-
static inline __device__ uint32_t nvstr_init_hash(char const *ptr, uint32_t len)
52+
static inline __device__ uint32_t hash_string(const string_view val)
5253
{
53-
if (len != 0) {
54-
return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1);
55-
} else {
54+
if (val.empty()) {
5655
return 0;
56+
} else {
57+
char const *ptr = val.data();
58+
uint32_t len = val.size_bytes();
59+
return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1);
5760
}
5861
}
5962

@@ -71,7 +74,8 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
7174
{
7275
if (t == 0) { s->nnz = 0; }
7376
for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) {
74-
const uint32_t *valid_map = s->chunk.valid_map_base;
77+
const uint32_t *valid_map = s->chunk.leaf_column->null_mask();
78+
auto column_offset = s->chunk.leaf_column->offset();
7579
uint32_t is_valid, nz_pos;
7680
if (t < block_size / 32) {
7781
if (!valid_map) {
@@ -80,10 +84,10 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
8084
uint32_t const row = s->chunk.start_row + i + t * 32;
8185
auto const chunk_end = s->chunk.start_row + s->chunk.num_rows;
8286

83-
auto const valid_map_idx = (row + s->chunk.column_offset) / 32;
87+
auto const valid_map_idx = (row + column_offset) / 32;
8488
uint32_t valid = (row < chunk_end) ? valid_map[valid_map_idx] : 0;
8589

86-
auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f;
90+
auto const rows_in_next_word = (row + column_offset) & 0x1f;
8791
if (rows_in_next_word != 0) {
8892
auto const rows_in_current_word = 32 - rows_in_next_word;
8993
// Read next word if any rows are within the chunk
@@ -111,12 +115,18 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
111115
* @brief Gather all non-NULL string rows and compute total character data size
112116
*
113117
* @param[in] chunks DictionaryChunk device array [rowgroup][column]
114-
* @param[in] num_columns Number of columns
118+
* @param[in] num_columns Number of string columns
115119
*/
116120
// blockDim {block_size,1,1}
117121
template <int block_size>
118122
__global__ void __launch_bounds__(block_size, 2)
119-
gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
123+
gpuInitDictionaryIndices(DictionaryChunk *chunks,
124+
const table_device_view view,
125+
uint32_t *dict_data,
126+
uint32_t *dict_index,
127+
size_t row_index_stride,
128+
size_type *str_col_ids,
129+
uint32_t num_columns)
120130
{
121131
__shared__ __align__(16) dictinit_state_s state_g;
122132

@@ -131,12 +141,21 @@ __global__ void __launch_bounds__(block_size, 2)
131141
dictinit_state_s *const s = &state_g;
132142
uint32_t col_id = blockIdx.x;
133143
uint32_t group_id = blockIdx.y;
134-
const nvstrdesc_s *ck_data;
135-
uint32_t *dict_data;
136144
uint32_t nnz, start_row, dict_char_count;
137145
int t = threadIdx.x;
138146

139-
if (t == 0) s->chunk = chunks[group_id * num_columns + col_id];
147+
if (t == 0) {
148+
column_device_view *leaf_column_view = view.begin() + str_col_ids[col_id];
149+
s->chunk = chunks[group_id * num_columns + col_id];
150+
s->chunk.leaf_column = leaf_column_view;
151+
s->chunk.dict_data =
152+
dict_data + col_id * leaf_column_view->size() + group_id * row_index_stride;
153+
s->chunk.dict_index = dict_index + col_id * leaf_column_view->size();
154+
s->chunk.start_row = group_id * row_index_stride;
155+
s->chunk.num_rows =
156+
min(row_index_stride,
157+
max(static_cast<size_t>(leaf_column_view->size() - s->chunk.start_row), size_t{0}));
158+
}
140159
for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) {
141160
if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0;
142161
}
@@ -152,15 +171,15 @@ __global__ void __launch_bounds__(block_size, 2)
152171
nnz = s->nnz;
153172
dict_data = s->chunk.dict_data;
154173
start_row = s->chunk.start_row;
155-
ck_data = static_cast<const nvstrdesc_s *>(s->chunk.column_data_base) + start_row;
156174
for (uint32_t i = 0; i < nnz; i += block_size) {
157175
uint32_t ck_row = 0;
158176
uint32_t hash = 0;
159177
uint32_t len = 0;
160178
if (i + t < nnz) {
161-
ck_row = s->dict[i + t];
162-
len = static_cast<uint32_t>(ck_data[ck_row].count);
163-
hash = nvstr_init_hash(ck_data[ck_row].ptr, len);
179+
ck_row = s->dict[i + t];
180+
string_view string_val = s->chunk.leaf_column->element<string_view>(ck_row + start_row);
181+
len = static_cast<uint32_t>(string_val.size_bytes());
182+
hash = hash_string(string_val);
164183
}
165184
len = block_reduce(temp_storage.reduce_storage).Sum(len);
166185
if (t == 0) s->chunk.string_char_count += len;
@@ -200,10 +219,11 @@ __global__ void __launch_bounds__(block_size, 2)
200219
uint32_t ck_row = 0, pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row;
201220
bool collision;
202221
if (i + t < nnz) {
203-
ck_row = dict_data[i + t] - start_row;
204-
hash = nvstr_init_hash(ck_data[ck_row].ptr, static_cast<uint32_t>(ck_data[ck_row].count));
205-
sh = (hash & 1) ? 16 : 0;
206-
pos_old = s->map.u16[hash];
222+
ck_row = dict_data[i + t] - start_row;
223+
string_view string_val = s->chunk.leaf_column->element<string_view>(ck_row + start_row);
224+
hash = hash_string(string_val);
225+
sh = (hash & 1) ? 16 : 0;
226+
pos_old = s->map.u16[hash];
207227
}
208228
// The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic
209229
// behavior for the first row in the hash map that will be used for early duplicate detection
@@ -233,18 +253,16 @@ __global__ void __launch_bounds__(block_size, 2)
233253
for (uint32_t i = 0; i < nnz; i += block_size) {
234254
uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0;
235255
if (i + t < nnz) {
236-
const char *str1, *str2;
237-
uint32_t len1, len2, hash;
238-
ck_row = s->dict[i + t];
239-
str1 = ck_data[ck_row].ptr;
240-
len1 = static_cast<uint32_t>(ck_data[ck_row].count);
241-
hash = nvstr_init_hash(str1, len1);
242-
ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0];
256+
ck_row = s->dict[i + t];
257+
string_view string_value = s->chunk.leaf_column->element<string_view>(ck_row + start_row);
258+
auto const string_length = static_cast<uint32_t>(string_value.size_bytes());
259+
auto const hash = hash_string(string_value);
260+
ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0];
243261
if (ck_row_ref != ck_row) {
244-
str2 = ck_data[ck_row_ref].ptr;
245-
len2 = static_cast<uint32_t>(ck_data[ck_row_ref].count);
246-
is_dupe = nvstr_is_equal(str1, len1, str2, len2);
247-
dict_char_count += (is_dupe) ? 0 : len1;
262+
string_view reference_string =
263+
s->chunk.leaf_column->element<string_view>(ck_row_ref + start_row);
264+
is_dupe = (string_value == reference_string);
265+
dict_char_count += (is_dupe) ? 0 : string_length;
248266
}
249267
}
250268
uint32_t dupes_in_block;
@@ -269,6 +287,12 @@ __global__ void __launch_bounds__(block_size, 2)
269287
chunks[group_id * num_columns + col_id].string_char_count = s->chunk.string_char_count;
270288
chunks[group_id * num_columns + col_id].num_dict_strings = nnz - s->total_dupes;
271289
chunks[group_id * num_columns + col_id].dict_char_count = dict_char_count;
290+
chunks[group_id * num_columns + col_id].leaf_column = s->chunk.leaf_column;
291+
292+
chunks[group_id * num_columns + col_id].dict_data = s->chunk.dict_data;
293+
chunks[group_id * num_columns + col_id].dict_index = s->chunk.dict_index;
294+
chunks[group_id * num_columns + col_id].start_row = s->chunk.start_row;
295+
chunks[group_id * num_columns + col_id].num_rows = s->chunk.num_rows;
272296
}
273297
}
274298

@@ -357,7 +381,6 @@ __global__ void __launch_bounds__(block_size)
357381
uint32_t num_strings;
358382
uint32_t *dict_data, *dict_index;
359383
uint32_t dict_char_count;
360-
const nvstrdesc_s *str_data;
361384
int t = threadIdx.x;
362385

363386
if (t == 0) s->stripe = stripes[stripe_id * num_columns + col_id];
@@ -366,21 +389,17 @@ __global__ void __launch_bounds__(block_size)
366389
num_strings = s->stripe.num_strings;
367390
dict_data = s->stripe.dict_data;
368391
if (!dict_data) return;
369-
dict_index = s->stripe.dict_index;
370-
str_data = static_cast<const nvstrdesc_s *>(s->stripe.column_data_base);
371-
dict_char_count = 0;
392+
dict_index = s->stripe.dict_index;
393+
string_view current_string = string_view::min();
394+
dict_char_count = 0;
372395
for (uint32_t i = 0; i < num_strings; i += block_size) {
373396
uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0;
374397
uint32_t cur_len = 0;
375-
const char *cur_ptr;
376-
bool is_dupe = false;
377-
if (i + t < num_strings) {
378-
cur_ptr = str_data[cur].ptr;
379-
cur_len = str_data[cur].count;
380-
}
398+
bool is_dupe = false;
399+
if (i + t < num_strings) { current_string = s->stripe.leaf_column->element<string_view>(cur); }
381400
if (i + t != 0 && i + t < num_strings) {
382401
uint32_t prev = dict_data[i + t - 1];
383-
is_dupe = nvstr_is_equal(cur_ptr, cur_len, str_data[prev].ptr, str_data[prev].count);
402+
is_dupe = (current_string == (s->stripe.leaf_column->element<string_view>(prev)));
384403
}
385404
dict_char_count += (is_dupe) ? 0 : cur_len;
386405
uint32_t dupes_in_block;
@@ -403,35 +422,27 @@ __global__ void __launch_bounds__(block_size)
403422
}
404423

405424
/**
406-
* @brief Launches kernel for initializing dictionary chunks
407-
*
408-
* @param[in] chunks DictionaryChunk device array [rowgroup][column]
409-
* @param[in] num_columns Number of columns
410-
* @param[in] num_rowgroups Number of row groups
411-
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
425+
* @copydoc cudf::io::orc::gpu::InitDictionaryIndices
412426
*/
413-
void InitDictionaryIndices(DictionaryChunk *chunks,
427+
void InitDictionaryIndices(const table_device_view &view,
428+
DictionaryChunk *chunks,
429+
uint32_t *dict_data,
430+
uint32_t *dict_index,
431+
size_t row_index_stride,
432+
size_type *str_col_ids,
414433
uint32_t num_columns,
415434
uint32_t num_rowgroups,
416435
rmm::cuda_stream_view stream)
417436
{
418437
static constexpr int block_size = 512;
419438
dim3 dim_block(block_size, 1);
420439
dim3 dim_grid(num_columns, num_rowgroups);
421-
gpuInitDictionaryIndices<block_size>
422-
<<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_columns);
440+
gpuInitDictionaryIndices<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
441+
chunks, view, dict_data, dict_index, row_index_stride, str_col_ids, num_columns);
423442
}
424443

425444
/**
426-
* @brief Launches kernel for building stripe dictionaries
427-
*
428-
* @param[in] stripes StripeDictionary device array [stripe][column]
429-
* @param[in] stripes_host StripeDictionary host array [stripe][column]
430-
* @param[in] chunks DictionaryChunk device array [rowgroup][column]
431-
* @param[in] num_stripes Number of stripes
432-
* @param[in] num_rowgroups Number of row groups
433-
* @param[in] num_columns Number of columns
434-
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
445+
* @copydoc cudf::io::orc::gpu::BuildStripeDictionaries
435446
*/
436447
void BuildStripeDictionaries(StripeDictionary *stripes,
437448
StripeDictionary *stripes_host,
@@ -447,18 +458,16 @@ void BuildStripeDictionaries(StripeDictionary *stripes,
447458
stripes, chunks, num_columns);
448459
for (uint32_t i = 0; i < num_stripes * num_columns; i++) {
449460
if (stripes_host[i].dict_data != nullptr) {
450-
thrust::device_ptr<uint32_t> p = thrust::device_pointer_cast(stripes_host[i].dict_data);
451-
const nvstrdesc_s *str_data =
452-
static_cast<const nvstrdesc_s *>(stripes_host[i].column_data_base);
461+
thrust::device_ptr<uint32_t> dict_data_ptr =
462+
thrust::device_pointer_cast(stripes_host[i].dict_data);
463+
column_device_view *string_column = stripes_host[i].leaf_column;
453464
// NOTE: Requires the --expt-extended-lambda nvcc flag
454465
thrust::sort(rmm::exec_policy(stream),
455-
p,
456-
p + stripes_host[i].num_strings,
457-
[str_data] __device__(const uint32_t &lhs, const uint32_t &rhs) {
458-
return nvstr_is_lesser(str_data[lhs].ptr,
459-
(uint32_t)str_data[lhs].count,
460-
str_data[rhs].ptr,
461-
(uint32_t)str_data[rhs].count);
466+
dict_data_ptr,
467+
dict_data_ptr + stripes_host[i].num_strings,
468+
[string_column] __device__(const uint32_t &lhs, const uint32_t &rhs) {
469+
return string_column->element<string_view>(lhs) <
470+
string_column->element<string_view>(rhs);
462471
});
463472
}
464473
}

0 commit comments

Comments
 (0)