Skip to content

Commit 3c96b8f

Browse files
authored
PERF: StringEngine for string dtype indexing ops (pandas-dev#56997)
* add StringEngine for string dtype indexes
* whatsnew
* ensure str
* mypy
* subclass IndexEngine
* update to match class implementation
1 parent 85b7445 commit 3c96b8f

File tree

5 files changed

+18
-0
lines changed

5 files changed

+18
-0
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,7 @@ Performance improvements
107107
- Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
108108
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
109109
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
110+
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
110111
-
111112

112113
.. ---------------------------------------------------------------------------

pandas/_libs/hashtable_class_helper.pxi.in

+3
Original file line number | Diff line number | Diff line change
@@ -933,6 +933,9 @@ cdef class StringHashTable(HashTable):
933933
kh_destroy_str(self.table)
934934
self.table = NULL
935935

936+
def __len__(self) -> int:
937+
return self.table.size
938+
936939
def sizeof(self, deep: bool = False) -> int:
937940
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
938941
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)

pandas/_libs/index.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ class UInt32Engine(IndexEngine): ...
5050
class UInt16Engine(IndexEngine): ...
5151
class UInt8Engine(IndexEngine): ...
5252
class ObjectEngine(IndexEngine): ...
53+
class StringEngine(IndexEngine): ...
5354
class DatetimeEngine(Int64Engine): ...
5455
class TimedeltaEngine(DatetimeEngine): ...
5556
class PeriodEngine(Int64Engine): ...

pandas/_libs/index.pyx

+11
Original file line number | Diff line number | Diff line change
@@ -533,6 +533,17 @@ cdef class ObjectEngine(IndexEngine):
533533
return loc
534534

535535

536+
cdef class StringEngine(IndexEngine):
537+
538+
cdef _make_hash_table(self, Py_ssize_t n):
539+
return _hash.StringHashTable(n)
540+
541+
cdef _check_type(self, object val):
542+
if not isinstance(val, str):
543+
raise KeyError(val)
544+
return str(val)
545+
546+
536547
cdef class DatetimeEngine(Int64Engine):
537548

538549
cdef:

pandas/core/indexes/base.py

+2
Original file line number | Diff line number | Diff line change
@@ -883,6 +883,8 @@ def _engine(
883883
# error: Item "ExtensionArray" of "Union[ExtensionArray,
884884
# ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr]
885885
target_values = self._data._ndarray # type: ignore[union-attr]
886+
elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
887+
return libindex.StringEngine(target_values)
886888

887889
# error: Argument 1 to "ExtensionEngine" has incompatible type
888890
# "ndarray[Any, Any]"; expected "ExtensionArray"

0 commit comments

Comments
 (0)