17
17
#include " orc_common.h"
18
18
#include " orc_gpu.h"
19
19
20
+ #include < cudf/table/table_device_view.cuh>
20
21
#include < io/utilities/block_utils.cuh>
21
22
22
23
#include < rmm/cuda_stream_view.hpp>
@@ -46,14 +47,16 @@ struct dictinit_state_s {
46
47
};
47
48
48
49
/**
 * @brief Return a 12-bit hash from a string
 *
 * The hash combines the first byte, the last byte and the byte length of the string and keeps
 * only the low `init_hash_bits` bits. An empty string always hashes to 0.
 *
 * @param[in] val String to hash
 *
 * @return Hash value masked to `init_hash_bits` bits
 */
static inline __device__ uint32_t hash_string(const string_view val)
{
  if (val.empty()) { return 0; }

  char const *ptr    = val.data();
  uint32_t const len = val.size_bytes();
  // Widen each byte exactly as the implicit char -> int promotion would, then continue in
  // unsigned arithmetic. Left-shifting a negative int is undefined before C++20, and bytes
  // >= 0x80 are negative wherever char is signed. The resulting hash bits are unchanged.
  auto const first_byte = static_cast<uint32_t>(static_cast<int>(ptr[0]));
  auto const last_byte  = static_cast<uint32_t>(static_cast<int>(ptr[len - 1]));
  return (first_byte + (last_byte << 5) + (len << 10)) & ((1 << init_hash_bits) - 1);
}
59
62
@@ -71,7 +74,8 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
71
74
{
72
75
if (t == 0 ) { s->nnz = 0 ; }
73
76
for (uint32_t i = 0 ; i < s->chunk .num_rows ; i += block_size) {
74
- const uint32_t *valid_map = s->chunk .valid_map_base ;
77
+ const uint32_t *valid_map = s->chunk .leaf_column ->null_mask ();
78
+ auto column_offset = s->chunk .leaf_column ->offset ();
75
79
uint32_t is_valid, nz_pos;
76
80
if (t < block_size / 32 ) {
77
81
if (!valid_map) {
@@ -80,10 +84,10 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
80
84
uint32_t const row = s->chunk .start_row + i + t * 32 ;
81
85
auto const chunk_end = s->chunk .start_row + s->chunk .num_rows ;
82
86
83
- auto const valid_map_idx = (row + s-> chunk . column_offset ) / 32 ;
87
+ auto const valid_map_idx = (row + column_offset) / 32 ;
84
88
uint32_t valid = (row < chunk_end) ? valid_map[valid_map_idx] : 0 ;
85
89
86
- auto const rows_in_next_word = (row + s-> chunk . column_offset ) & 0x1f ;
90
+ auto const rows_in_next_word = (row + column_offset) & 0x1f ;
87
91
if (rows_in_next_word != 0 ) {
88
92
auto const rows_in_current_word = 32 - rows_in_next_word;
89
93
// Read next word if any rows are within the chunk
@@ -111,12 +115,18 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
111
115
* @brief Gather all non-NULL string rows and compute total character data size
112
116
*
113
117
* @param[in] chunks DictionaryChunk device array [rowgroup][column]
114
- * @param[in] num_columns Number of columns
118
+ * @param[in] num_columns Number of string columns
115
119
*/
116
120
// blockDim {block_size,1,1}
117
121
template <int block_size>
118
122
__global__ void __launch_bounds__ (block_size, 2 )
119
- gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
123
+ gpuInitDictionaryIndices(DictionaryChunk *chunks,
124
+ const table_device_view view,
125
+ uint32_t *dict_data,
126
+ uint32_t *dict_index,
127
+ size_t row_index_stride,
128
+ size_type *str_col_ids,
129
+ uint32_t num_columns)
120
130
{
121
131
__shared__ __align__ (16 ) dictinit_state_s state_g;
122
132
@@ -131,12 +141,21 @@ __global__ void __launch_bounds__(block_size, 2)
131
141
dictinit_state_s *const s = &state_g;
132
142
uint32_t col_id = blockIdx .x ;
133
143
uint32_t group_id = blockIdx .y ;
134
- const nvstrdesc_s *ck_data;
135
- uint32_t *dict_data;
136
144
uint32_t nnz, start_row, dict_char_count;
137
145
int t = threadIdx .x ;
138
146
139
- if (t == 0 ) s->chunk = chunks[group_id * num_columns + col_id];
147
+ if (t == 0 ) {
148
+ column_device_view *leaf_column_view = view.begin () + str_col_ids[col_id];
149
+ s->chunk = chunks[group_id * num_columns + col_id];
150
+ s->chunk .leaf_column = leaf_column_view;
151
+ s->chunk .dict_data =
152
+ dict_data + col_id * leaf_column_view->size () + group_id * row_index_stride;
153
+ s->chunk .dict_index = dict_index + col_id * leaf_column_view->size ();
154
+ s->chunk .start_row = group_id * row_index_stride;
155
+ s->chunk .num_rows =
156
+ min (row_index_stride,
157
+ max (static_cast <size_t >(leaf_column_view->size () - s->chunk .start_row ), size_t {0 }));
158
+ }
140
159
for (uint32_t i = 0 ; i < sizeof (s->map ) / sizeof (uint32_t ); i += block_size) {
141
160
if (i + t < sizeof (s->map ) / sizeof (uint32_t )) s->map .u32 [i + t] = 0 ;
142
161
}
@@ -152,15 +171,15 @@ __global__ void __launch_bounds__(block_size, 2)
152
171
nnz = s->nnz ;
153
172
dict_data = s->chunk .dict_data ;
154
173
start_row = s->chunk .start_row ;
155
- ck_data = static_cast <const nvstrdesc_s *>(s->chunk .column_data_base ) + start_row;
156
174
for (uint32_t i = 0 ; i < nnz; i += block_size) {
157
175
uint32_t ck_row = 0 ;
158
176
uint32_t hash = 0 ;
159
177
uint32_t len = 0 ;
160
178
if (i + t < nnz) {
161
- ck_row = s->dict [i + t];
162
- len = static_cast <uint32_t >(ck_data[ck_row].count );
163
- hash = nvstr_init_hash (ck_data[ck_row].ptr , len);
179
+ ck_row = s->dict [i + t];
180
+ string_view string_val = s->chunk .leaf_column ->element <string_view>(ck_row + start_row);
181
+ len = static_cast <uint32_t >(string_val.size_bytes ());
182
+ hash = hash_string (string_val);
164
183
}
165
184
len = block_reduce (temp_storage.reduce_storage ).Sum (len);
166
185
if (t == 0 ) s->chunk .string_char_count += len;
@@ -200,10 +219,11 @@ __global__ void __launch_bounds__(block_size, 2)
200
219
uint32_t ck_row = 0 , pos = 0 , hash = 0 , pos_old, pos_new, sh, colliding_row;
201
220
bool collision;
202
221
if (i + t < nnz) {
203
- ck_row = dict_data[i + t] - start_row;
204
- hash = nvstr_init_hash (ck_data[ck_row].ptr , static_cast <uint32_t >(ck_data[ck_row].count ));
205
- sh = (hash & 1 ) ? 16 : 0 ;
206
- pos_old = s->map .u16 [hash];
222
+ ck_row = dict_data[i + t] - start_row;
223
+ string_view string_val = s->chunk .leaf_column ->element <string_view>(ck_row + start_row);
224
+ hash = hash_string (string_val);
225
+ sh = (hash & 1 ) ? 16 : 0 ;
226
+ pos_old = s->map .u16 [hash];
207
227
}
208
228
// The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic
209
229
// behavior for the first row in the hash map that will be used for early duplicate detection
@@ -233,18 +253,16 @@ __global__ void __launch_bounds__(block_size, 2)
233
253
for (uint32_t i = 0 ; i < nnz; i += block_size) {
234
254
uint32_t ck_row = 0 , ck_row_ref = 0 , is_dupe = 0 ;
235
255
if (i + t < nnz) {
236
- const char *str1, *str2;
237
- uint32_t len1, len2, hash;
238
- ck_row = s->dict [i + t];
239
- str1 = ck_data[ck_row].ptr ;
240
- len1 = static_cast <uint32_t >(ck_data[ck_row].count );
241
- hash = nvstr_init_hash (str1, len1);
242
- ck_row_ref = s->dict [(hash > 0 ) ? s->map .u16 [hash - 1 ] : 0 ];
256
+ ck_row = s->dict [i + t];
257
+ string_view string_value = s->chunk .leaf_column ->element <string_view>(ck_row + start_row);
258
+ auto const string_length = static_cast <uint32_t >(string_value.size_bytes ());
259
+ auto const hash = hash_string (string_value);
260
+ ck_row_ref = s->dict [(hash > 0 ) ? s->map .u16 [hash - 1 ] : 0 ];
243
261
if (ck_row_ref != ck_row) {
244
- str2 = ck_data[ck_row_ref]. ptr ;
245
- len2 = static_cast < uint32_t >(ck_data[ ck_row_ref]. count );
246
- is_dupe = nvstr_is_equal (str1, len1, str2, len2 );
247
- dict_char_count += (is_dupe) ? 0 : len1 ;
262
+ string_view reference_string =
263
+ s-> chunk . leaf_column -> element <string_view>( ck_row_ref + start_row );
264
+ is_dupe = (string_value == reference_string );
265
+ dict_char_count += (is_dupe) ? 0 : string_length ;
248
266
}
249
267
}
250
268
uint32_t dupes_in_block;
@@ -269,6 +287,12 @@ __global__ void __launch_bounds__(block_size, 2)
269
287
chunks[group_id * num_columns + col_id].string_char_count = s->chunk .string_char_count ;
270
288
chunks[group_id * num_columns + col_id].num_dict_strings = nnz - s->total_dupes ;
271
289
chunks[group_id * num_columns + col_id].dict_char_count = dict_char_count;
290
+ chunks[group_id * num_columns + col_id].leaf_column = s->chunk .leaf_column ;
291
+
292
+ chunks[group_id * num_columns + col_id].dict_data = s->chunk .dict_data ;
293
+ chunks[group_id * num_columns + col_id].dict_index = s->chunk .dict_index ;
294
+ chunks[group_id * num_columns + col_id].start_row = s->chunk .start_row ;
295
+ chunks[group_id * num_columns + col_id].num_rows = s->chunk .num_rows ;
272
296
}
273
297
}
274
298
@@ -357,7 +381,6 @@ __global__ void __launch_bounds__(block_size)
357
381
uint32_t num_strings;
358
382
uint32_t *dict_data, *dict_index;
359
383
uint32_t dict_char_count;
360
- const nvstrdesc_s *str_data;
361
384
int t = threadIdx .x ;
362
385
363
386
if (t == 0 ) s->stripe = stripes[stripe_id * num_columns + col_id];
@@ -366,21 +389,17 @@ __global__ void __launch_bounds__(block_size)
366
389
num_strings = s->stripe .num_strings ;
367
390
dict_data = s->stripe .dict_data ;
368
391
if (!dict_data) return ;
369
- dict_index = s->stripe .dict_index ;
370
- str_data = static_cast < const nvstrdesc_s *>(s-> stripe . column_data_base );
371
- dict_char_count = 0 ;
392
+ dict_index = s->stripe .dict_index ;
393
+ string_view current_string = string_view::min ( );
394
+ dict_char_count = 0 ;
372
395
for (uint32_t i = 0 ; i < num_strings; i += block_size) {
373
396
uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0 ;
374
397
uint32_t cur_len = 0 ;
375
- const char *cur_ptr;
376
- bool is_dupe = false ;
377
- if (i + t < num_strings) {
378
- cur_ptr = str_data[cur].ptr ;
379
- cur_len = str_data[cur].count ;
380
- }
398
+ bool is_dupe = false ;
399
+ if (i + t < num_strings) { current_string = s->stripe .leaf_column ->element <string_view>(cur); }
381
400
if (i + t != 0 && i + t < num_strings) {
382
401
uint32_t prev = dict_data[i + t - 1 ];
383
- is_dupe = nvstr_is_equal (cur_ptr, cur_len, str_data[prev]. ptr , str_data[ prev]. count );
402
+ is_dupe = (current_string == (s-> stripe . leaf_column -> element <string_view>( prev)) );
384
403
}
385
404
dict_char_count += (is_dupe) ? 0 : cur_len;
386
405
uint32_t dupes_in_block;
@@ -403,35 +422,27 @@ __global__ void __launch_bounds__(block_size)
403
422
}
404
423
405
424
/**
 * @copydoc cudf::io::orc::gpu::InitDictionaryIndices
 *
 * Launches `gpuInitDictionaryIndices` on a `num_columns` x `num_rowgroups` grid with 512 threads
 * per block. Nothing is launched when either grid dimension is zero.
 */
void InitDictionaryIndices(const table_device_view &view,
                           DictionaryChunk *chunks,
                           uint32_t *dict_data,
                           uint32_t *dict_index,
                           size_t row_index_stride,
                           size_type *str_col_ids,
                           uint32_t num_columns,
                           uint32_t num_rowgroups,
                           rmm::cuda_stream_view stream)
{
  // A grid with a zero-sized dimension is an invalid launch configuration, and there is no
  // work to do in that case anyway.
  if (num_columns == 0 || num_rowgroups == 0) { return; }

  static constexpr int block_size = 512;
  dim3 dim_block(block_size, 1);
  // blockIdx.x selects the column, blockIdx.y selects the row group
  dim3 dim_grid(num_columns, num_rowgroups);
  gpuInitDictionaryIndices<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
    chunks, view, dict_data, dict_index, row_index_stride, str_col_ids, num_columns);
}
424
443
425
444
/* *
426
- * @brief Launches kernel for building stripe dictionaries
427
- *
428
- * @param[in] stripes StripeDictionary device array [stripe][column]
429
- * @param[in] stripes_host StripeDictionary host array [stripe][column]
430
- * @param[in] chunks DictionaryChunk device array [rowgroup][column]
431
- * @param[in] num_stripes Number of stripes
432
- * @param[in] num_rowgroups Number of row groups
433
- * @param[in] num_columns Number of columns
434
- * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
445
+ * @copydoc cudf::io::orc::gpu::BuildStripeDictionaries
435
446
*/
436
447
void BuildStripeDictionaries (StripeDictionary *stripes,
437
448
StripeDictionary *stripes_host,
@@ -447,18 +458,16 @@ void BuildStripeDictionaries(StripeDictionary *stripes,
447
458
stripes, chunks, num_columns);
448
459
for (uint32_t i = 0 ; i < num_stripes * num_columns; i++) {
449
460
if (stripes_host[i].dict_data != nullptr ) {
450
- thrust::device_ptr<uint32_t > p = thrust::device_pointer_cast (stripes_host[i]. dict_data );
451
- const nvstrdesc_s *str_data =
452
- static_cast < const nvstrdesc_s *>( stripes_host[i].column_data_base ) ;
461
+ thrust::device_ptr<uint32_t > dict_data_ptr =
462
+ thrust::device_pointer_cast (stripes_host[i]. dict_data );
463
+ column_device_view *string_column = stripes_host[i].leaf_column ;
453
464
// NOTE: Requires the --expt-extended-lambda nvcc flag
454
465
thrust::sort (rmm::exec_policy (stream),
455
- p,
456
- p + stripes_host[i].num_strings ,
457
- [str_data] __device__ (const uint32_t &lhs, const uint32_t &rhs) {
458
- return nvstr_is_lesser (str_data[lhs].ptr ,
459
- (uint32_t )str_data[lhs].count ,
460
- str_data[rhs].ptr ,
461
- (uint32_t )str_data[rhs].count );
466
+ dict_data_ptr,
467
+ dict_data_ptr + stripes_host[i].num_strings ,
468
+ [string_column] __device__ (const uint32_t &lhs, const uint32_t &rhs) {
469
+ return string_column->element <string_view>(lhs) <
470
+ string_column->element <string_view>(rhs);
462
471
});
463
472
}
464
473
}
0 commit comments