@@ -1468,42 +1468,43 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
1468
1468
// which case we fall back to the dense write path)
1469
1469
std::shared_ptr<::arrow::Array> preserved_dictionary_;
1470
1470
1471
- int64_t WriteLevels (int64_t num_values , const int16_t * def_levels,
1471
+ int64_t WriteLevels (int64_t num_levels , const int16_t * def_levels,
1472
1472
const int16_t * rep_levels) {
1473
+ // Update histograms now, to maximize cache efficiency.
1474
+ UpdateLevelHistogram (num_levels, def_levels, rep_levels);
1475
+
1473
1476
int64_t values_to_write = 0 ;
1474
1477
// If the field is required and non-repeated, there are no definition levels
1475
1478
if (descr_->max_definition_level () > 0 ) {
1476
- for (int64_t i = 0 ; i < num_values ; ++i) {
1479
+ for (int64_t i = 0 ; i < num_levels ; ++i) {
1477
1480
if (def_levels[i] == descr_->max_definition_level ()) {
1478
1481
++values_to_write;
1479
1482
}
1480
1483
}
1481
1484
1482
- WriteDefinitionLevels (num_values , def_levels);
1485
+ WriteDefinitionLevels (num_levels , def_levels);
1483
1486
} else {
1484
1487
// Required field, write all values
1485
- values_to_write = num_values ;
1488
+ values_to_write = num_levels ;
1486
1489
}
1487
1490
1488
1491
// Not present for non-repeated fields
1489
1492
if (descr_->max_repetition_level () > 0 ) {
1490
1493
// A row could include more than one value
1491
1494
// Count the occasions where we start a new row
1492
- for (int64_t i = 0 ; i < num_values ; ++i) {
1495
+ for (int64_t i = 0 ; i < num_levels ; ++i) {
1493
1496
if (rep_levels[i] == 0 ) {
1494
1497
rows_written_++;
1495
1498
num_buffered_rows_++;
1496
1499
}
1497
1500
}
1498
1501
1499
- WriteRepetitionLevels (num_values , rep_levels);
1502
+ WriteRepetitionLevels (num_levels , rep_levels);
1500
1503
} else {
1501
1504
// Each value is exactly one row
1502
- rows_written_ += num_values ;
1503
- num_buffered_rows_ += num_values ;
1505
+ rows_written_ += num_levels ;
1506
+ num_buffered_rows_ += num_levels ;
1504
1507
}
1505
-
1506
- UpdateLevelHistogram (num_values, def_levels, rep_levels);
1507
1508
return values_to_write;
1508
1509
}
1509
1510
@@ -1575,6 +1576,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
1575
1576
1576
1577
void WriteLevelsSpaced (int64_t num_levels, const int16_t * def_levels,
1577
1578
const int16_t * rep_levels) {
1579
+ // Update histograms now, to maximize cache efficiency.
1580
+ UpdateLevelHistogram (num_levels, def_levels, rep_levels);
1581
+
1578
1582
// If the field is required and non-repeated, there are no definition levels
1579
1583
if (descr_->max_definition_level () > 0 ) {
1580
1584
WriteDefinitionLevels (num_levels, def_levels);
@@ -1595,8 +1599,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
1595
1599
rows_written_ += num_levels;
1596
1600
num_buffered_rows_ += num_levels;
1597
1601
}
1598
-
1599
- UpdateLevelHistogram (num_levels, def_levels, rep_levels);
1600
1602
}
1601
1603
1602
1604
void UpdateLevelHistogram (int64_t num_levels, const int16_t * def_levels,
@@ -1606,26 +1608,17 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
1606
1608
}
1607
1609
1608
1610
auto add_levels = [](std::vector<int64_t >& level_histogram,
1609
- ::arrow::util::span<const int16_t > levels) {
1610
- for (int16_t level : levels) {
1611
- ARROW_DCHECK_LT (level, static_cast <int16_t >(level_histogram.size ()));
1612
- ++level_histogram[level];
1613
- }
1611
+ ::arrow::util::span<const int16_t > levels, int16_t max_level) {
1612
+ ARROW_DCHECK_EQ (static_cast <size_t >(max_level) + 1 , level_histogram.size ());
1613
+ ::parquet::UpdateLevelHistogram (levels, level_histogram);
1614
1614
};
1615
1615
1616
- if (descr_->max_definition_level () > 0 ) {
1617
- add_levels (page_size_statistics_->definition_level_histogram ,
1618
- {def_levels, static_cast <size_t >(num_levels)});
1619
- } else {
1620
- page_size_statistics_->definition_level_histogram [0 ] += num_levels;
1621
- }
1622
-
1623
- if (descr_->max_repetition_level () > 0 ) {
1624
- add_levels (page_size_statistics_->repetition_level_histogram ,
1625
- {rep_levels, static_cast <size_t >(num_levels)});
1626
- } else {
1627
- page_size_statistics_->repetition_level_histogram [0 ] += num_levels;
1628
- }
1616
+ add_levels (page_size_statistics_->definition_level_histogram ,
1617
+ {def_levels, static_cast <size_t >(num_levels)},
1618
+ descr_->max_definition_level ());
1619
+ add_levels (page_size_statistics_->repetition_level_histogram ,
1620
+ {rep_levels, static_cast <size_t >(num_levels)},
1621
+ descr_->max_repetition_level ());
1629
1622
}
1630
1623
1631
1624
// Update the unencoded data bytes for ByteArray only per the specification.
0 commit comments