Skip to content

Commit b8759ce

Browse files
committed
improvements / fixes
1 parent 0db5f0c commit b8759ce

File tree

1 file changed

+24
-26
lines changed

1 file changed

+24
-26
lines changed

pandas/_libs/new_vector.cpp

+24 -26
Original file line number | Diff line number | Diff line change
@@ -76,16 +76,16 @@ template <typename T> auto PandasIsNA(bool mask_value, T &scalar_value) {
7676
}
7777
}
7878

79-
template <typename T> auto MaybeResizeKlibContainer(T &container) {
80-
const auto current_size = container.size();
81-
if (container.n_buckets() == current_size) {
82-
container.resize(current_size * 4);
83-
}
84-
}
85-
8679
template <typename T> class PandasVector {
8780
public:
88-
explicit PandasVector<T>() : external_view_exists_(false) {}
81+
static constexpr size_t INIT_VEC_CAP = 128;
82+
83+
explicit PandasVector<T>() : external_view_exists_(false) {
84+
vec_.reserve(INIT_VEC_CAP);
85+
}
86+
explicit PandasVector<T>(std::vector<T>&& vec) : vec_(vec), external_view_exists_(false) {
87+
vec_.reserve(INIT_VEC_CAP);
88+
}
8989
~PandasVector<T>() = default;
9090
PandasVector<T>(PandasVector<T> const &) = delete;
9191
void operator=(PandasVector<T> const &) = delete;
@@ -137,8 +137,9 @@ template <typename T, bool IsMasked> class PandasHashTable {
137137
explicit PandasHashTable<T, IsMasked>(HashValueT new_size) {
138138
#if __APPLE__
139139
// macOS cannot resolve size_t to uint32_t or uint64_t that khash needs
140-
hash_map_.resize(static_cast<uint64_t>(new_size));
141-
hash_set_.resize(static_cast<uint64_t>(new_size));
140+
const auto ns = static_cast<uint64_t>(new_size);
141+
hash_map_.resize(ns);
142+
hash_set_.resize(ns);
142143
#else
143144
hash_map_.resize(new_size);
144145
hash_set_.resize(new_size);
@@ -226,7 +227,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
226227
const auto n = values_v.shape(0);
227228
for (auto i = decltype(n){0}; i < n; i++) {
228229
hash_map_[keys_v(i)] = values_v(i);
229-
MaybeResizeKlibContainer(hash_map_);
230230
}
231231
}
232232

@@ -251,15 +251,13 @@ template <typename T, bool IsMasked> class PandasHashTable {
251251
na_position = i;
252252
} else {
253253
hash_map_[values_v(i)] = i;
254-
MaybeResizeKlibContainer(hash_map_);
255254
}
256255
}
257256
na_position_ = na_position;
258257
} else {
259258
for (auto i = decltype(n){0}; i < n; i++) {
260259
const auto key = values_v(i);
261260
hash_map_[key] = i;
262-
MaybeResizeKlibContainer(hash_map_);
263261
}
264262
}
265263
}
@@ -428,7 +426,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
428426
int dummy;
429427
k = hash_map_.put(val, &dummy);
430428
hash_map_.value(k) = count;
431-
MaybeResizeKlibContainer(hash_map_);
432429
uniques.Append(val);
433430
labels[i] = count;
434431
count++;
@@ -487,7 +484,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
487484
k = hash_map_.put(val, &dummy);
488485
uniques.Append(val);
489486
hash_map_.value(k) = count_prior;
490-
MaybeResizeKlibContainer(hash_map_);
491487
labels[i] = count_prior;
492488
count_prior++;
493489
} else {
@@ -521,7 +517,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
521517
k = hash_map_.put(val, &dummy);
522518
uniques.Append(val);
523519
hash_map_.value(k) = count_prior;
524-
MaybeResizeKlibContainer(hash_map_);
525520
labels[i] = count_prior;
526521
count_prior++;
527522
} else {
@@ -550,8 +545,10 @@ template <typename T, bool IsMasked> class PandasHashTable {
550545

551546
const auto values_v = values.view();
552547
const auto n = values.shape(0);
553-
PandasVector<uint8_t> result;
548+
bool seen_na = false;
549+
auto na_pos = decltype(n){0};
554550

551+
std::vector<uint8_t> missing_vec;
555552
if constexpr (IsMasked) {
556553
using MaskT = nb::ndarray<const uint8_t, nb::ndim<1>>;
557554
MaskT mask;
@@ -560,43 +557,45 @@ template <typename T, bool IsMasked> class PandasHashTable {
560557
}
561558
nb::call_guard<nb::gil_scoped_release>();
562559
const auto mask_v = mask.view();
563-
564-
bool seen_na = false;
565560
for (auto i = decltype(n){0}; i < n; i++) {
566561
const auto val = values_v(i);
567562

568563
if (PandasIsNA(mask_v(i), val)) {
569564
if (!seen_na) {
570565
uniques.Append(val);
571-
result.Append(1);
566+
na_pos = i;
572567
seen_na = true;
573568
}
574569
continue;
575570
}
576571

577572
int absent;
578573
hash_set_.put(val, &absent);
579-
MaybeResizeKlibContainer(hash_set_);
580574
if (absent) {
581575
uniques.Append(val);
582-
result.Append(0);
583576
}
584577
}
585578
} else {
579+
// TODO: why do we even have this branch?
586580
nb::call_guard<nb::gil_scoped_release>();
587581
for (auto i = decltype(n){0}; i < n; i++) {
588582
const auto val = values_v(i);
589583
int absent;
590584
hash_set_.put(val, &absent);
591-
MaybeResizeKlibContainer(hash_set_);
592585
if (absent) {
593586
uniques.Append(val);
594-
result.Append(0);
595587
}
596588
}
597589
}
598590

599-
return result;
591+
592+
std::vector<uint8_t> tmp;
593+
tmp.resize(hash_set_.n_buckets(), 0);
594+
if (seen_na) {
595+
tmp[na_pos] = 1;
596+
}
597+
598+
return PandasVector(std::move(tmp));
600599
}
601600

602601
auto UniquesOnly(const nb::ndarray<const T, nb::ndim<1>> &values,
@@ -612,7 +611,6 @@ template <typename T, bool IsMasked> class PandasHashTable {
612611
if (k == hash_map_.end()) {
613612
int dummy;
614613
k = hash_map_.put(val, &dummy);
615-
MaybeResizeKlibContainer(hash_map_);
616614
uniques.Append(val);
617615
}
618616
}

0 commit comments

Comments
 (0)