@@ -76,16 +76,16 @@ template <typename T> auto PandasIsNA(bool mask_value, T &scalar_value) {
76
76
}
77
77
}
78
78
79
// NOTE(review): every line of this helper carries the diff's "-" marker, i.e.
// the commit shown here deletes it, along with the calls to it in later hunks.
//
// Behavior of the deleted helper: reads container.size(); when
// container.n_buckets() equals that size, it asks the container to resize to
// four times the current element count. Otherwise it does nothing.
// Presumably meant for klib/khash-style containers (per the name) - the only
// requirement visible here is that T provides size(), n_buckets() and resize().
- template <typename T> auto MaybeResizeKlibContainer (T &container) {
80
- const auto current_size = container.size ();
81
// Only act when the bucket count equals the element count.
- if (container.n_buckets () == current_size) {
82
- container.resize (current_size * 4 );
83
- }
84
- }
85
-
86
79
template <typename T> class PandasVector {
87
80
public:
88
- explicit PandasVector<T>() : external_view_exists_(false ) {}
81
+ static constexpr size_t INIT_VEC_CAP = 128 ;
82
+
83
// Default constructor: marks that no external view exists and calls
// vec_.reserve(INIT_VEC_CAP) (128, declared just above) on the backing vec_.
// NOTE(review): presumably done to avoid early reallocations while appending -
// confirm with the author; the motivation is not stated in this diff.
+ explicit PandasVector<T>() : external_view_exists_(false ) {
84
+ vec_.reserve (INIT_VEC_CAP);
85
+ }
86
// Constructs a PandasVector that takes over the contents of an existing
// std::vector<T>, with no external view outstanding.
//
// `vec` is a named rvalue reference and therefore an lvalue inside this
// constructor; it has to go through std::move, otherwise vec_ is
// copy-constructed and the caller's buffer is duplicated instead of adopted.
+ explicit PandasVector<T>(std::vector<T>&& vec) : vec_(std::move(vec)), external_view_exists_(false ) {
87
// Request the same INIT_VEC_CAP capacity that the default constructor does.
+ vec_.reserve (INIT_VEC_CAP);
88
+ }
89
89
~PandasVector<T>() = default ;
90
90
PandasVector<T>(PandasVector<T> const &) = delete ;
91
91
void operator =(PandasVector<T> const &) = delete ;
@@ -137,8 +137,9 @@ template <typename T, bool IsMasked> class PandasHashTable {
137
137
explicit PandasHashTable<T, IsMasked>(HashValueT new_size) {
138
138
#if __APPLE__
139
139
// macOS cannot resolve size_t to uint32_t or uint64_t that khash needs
140
- hash_map_.resize (static_cast <uint64_t >(new_size));
141
- hash_set_.resize (static_cast <uint64_t >(new_size));
140
+ const auto ns = static_cast <uint64_t >(new_size);
141
+ hash_map_.resize (ns);
142
+ hash_set_.resize (ns);
142
143
#else
143
144
hash_map_.resize (new_size);
144
145
hash_set_.resize (new_size);
@@ -226,7 +227,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
226
227
const auto n = values_v.shape (0 );
227
228
for (auto i = decltype (n){0 }; i < n; i++) {
228
229
hash_map_[keys_v (i)] = values_v (i);
229
- MaybeResizeKlibContainer (hash_map_);
230
230
}
231
231
}
232
232
@@ -251,15 +251,13 @@ template <typename T, bool IsMasked> class PandasHashTable {
251
251
na_position = i;
252
252
} else {
253
253
hash_map_[values_v (i)] = i;
254
- MaybeResizeKlibContainer (hash_map_);
255
254
}
256
255
}
257
256
na_position_ = na_position;
258
257
} else {
259
258
for (auto i = decltype (n){0 }; i < n; i++) {
260
259
const auto key = values_v (i);
261
260
hash_map_[key] = i;
262
- MaybeResizeKlibContainer (hash_map_);
263
261
}
264
262
}
265
263
}
@@ -428,7 +426,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
428
426
int dummy;
429
427
k = hash_map_.put (val, &dummy);
430
428
hash_map_.value (k) = count;
431
- MaybeResizeKlibContainer (hash_map_);
432
429
uniques.Append (val);
433
430
labels[i] = count;
434
431
count++;
@@ -487,7 +484,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
487
484
k = hash_map_.put (val, &dummy);
488
485
uniques.Append (val);
489
486
hash_map_.value (k) = count_prior;
490
- MaybeResizeKlibContainer (hash_map_);
491
487
labels[i] = count_prior;
492
488
count_prior++;
493
489
} else {
@@ -521,7 +517,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
521
517
k = hash_map_.put (val, &dummy);
522
518
uniques.Append (val);
523
519
hash_map_.value (k) = count_prior;
524
- MaybeResizeKlibContainer (hash_map_);
525
520
labels[i] = count_prior;
526
521
count_prior++;
527
522
} else {
@@ -550,8 +545,10 @@ template <typename T, bool IsMasked> class PandasHashTable {
550
545
551
546
const auto values_v = values.view ();
552
547
const auto n = values.shape (0 );
553
- PandasVector<uint8_t > result;
548
+ bool seen_na = false ;
549
+ auto na_pos = decltype (n){0 };
554
550
551
+ std::vector<uint8_t > missing_vec;
555
552
if constexpr (IsMasked) {
556
553
using MaskT = nb::ndarray<const uint8_t , nb::ndim<1 >>;
557
554
MaskT mask;
@@ -560,43 +557,45 @@ template <typename T, bool IsMasked> class PandasHashTable {
560
557
}
561
558
nb::call_guard<nb::gil_scoped_release>();
562
559
const auto mask_v = mask.view ();
563
-
564
- bool seen_na = false ;
565
560
for (auto i = decltype (n){0 }; i < n; i++) {
566
561
const auto val = values_v (i);
567
562
568
563
if (PandasIsNA (mask_v (i), val)) {
569
564
if (!seen_na) {
570
565
uniques.Append (val);
571
- result. Append ( 1 ) ;
566
+ na_pos = i ;
572
567
seen_na = true ;
573
568
}
574
569
continue ;
575
570
}
576
571
577
572
int absent;
578
573
hash_set_.put (val, &absent);
579
- MaybeResizeKlibContainer (hash_set_);
580
574
if (absent) {
581
575
uniques.Append (val);
582
- result.Append (0 );
583
576
}
584
577
}
585
578
} else {
579
+ // TODO: why do we even have this branch?
586
580
nb::call_guard<nb::gil_scoped_release>();
587
581
for (auto i = decltype (n){0 }; i < n; i++) {
588
582
const auto val = values_v (i);
589
583
int absent;
590
584
hash_set_.put (val, &absent);
591
- MaybeResizeKlibContainer (hash_set_);
592
585
if (absent) {
593
586
uniques.Append (val);
594
- result.Append (0 );
595
587
}
596
588
}
597
589
}
598
590
599
- return result;
591
+
592
+ std::vector<uint8_t > tmp;
593
+ tmp.resize (hash_set_.n_buckets (), 0 );
594
+ if (seen_na) {
595
+ tmp[na_pos] = 1 ;
596
+ }
597
+
598
+ return PandasVector (std::move (tmp));
600
599
}
601
600
602
601
auto UniquesOnly (const nb::ndarray<const T, nb::ndim<1 >> &values,
@@ -612,7 +611,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
612
611
if (k == hash_map_.end ()) {
613
612
int dummy;
614
613
k = hash_map_.put (val, &dummy);
615
- MaybeResizeKlibContainer (hash_map_);
616
614
uniques.Append (val);
617
615
}
618
616
}
0 commit comments