From 6f5a8ea4164da423059eeb3425514e3558ef9ec6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 16:54:56 +0100 Subject: [PATCH 01/20] Implement masked engine --- asv_bench/benchmarks/indexing.py | 28 +++ asv_bench/benchmarks/indexing_engines.py | 78 +++++++++ pandas/_libs/hashtable.pyi | 2 + pandas/_libs/hashtable_class_helper.pxi.in | 12 +- pandas/_libs/index.pyi | 27 ++- pandas/_libs/index.pyx | 189 ++++++++++++++++++--- pandas/_libs/index_class_helper.pxi.in | 15 +- pandas/core/indexes/base.py | 60 +++++-- 8 files changed, 371 insertions(+), 40 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 69e3d166943a8..7301c8e1d8204 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -10,9 +10,11 @@ import numpy as np from pandas import ( + NA, CategoricalIndex, DataFrame, Float64Index, + Index, Int64Index, IntervalIndex, MultiIndex, @@ -87,6 +89,32 @@ def time_loc_slice(self, index, index_structure): self.data.loc[:800000] +class NumericMaskedIndexing: + params = [ + ("Int64", "UInt64", "Float64"), + (True, False), + ] + param_names = ["dtype", "monotonic"] + + def setup(self, dtype, monotonic): + N = 10**6 + indices = { + True: Index(range(N), dtype=dtype), + False: Index( + list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype + ).append(Index([NA], dtype=dtype)), + } + self.data = indices[monotonic] + self.indexer = np.arange(300, 1_000) + self.data_dups = self.data.append(self.data) + + def time_get_indexer(self, dtype, monotonic): + self.data.get_indexer(self.indexer) + + def time_get_indexer_dups(self, dtype, monotonic): + self.data.get_indexer_for(self.indexer) + + class NonNumericSeriesIndexing: params = [ diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 0c6cb89f49da1..e8726a36cb2c9 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -9,6 +9,11 @@ from pandas._libs import index as libindex +from pandas import ( + NA, + Series, +) + def _get_numeric_engines(): engine_names = [ @@ -30,6 +35,26 @@ def _get_numeric_engines(): ] +def _get_masked_engines(): + engine_names = [ + ("MaskedInt64Engine", "Int64"), + ("MaskedInt32Engine", "Int32"), + ("MaskedInt16Engine", "Int16"), + ("MaskedInt8Engine", "Int8"), + ("MaskedUInt64Engine", "UInt64"), + ("MaskedUInt32Engine", "UInt32"), + ("MaskedUInt16engine", "UInt16"), + ("MaskedUInt8Engine", "UInt8"), + ("MaskedFloat64Engine", "Float64"), + ("MaskedFloat32Engine", "Float32"), + ] + return [ + (getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name) + ] + + class NumericEngineIndexing: params = [ @@ -80,6 +105,59 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): self.data.get_loc(self.key_middle) +class MaskedNumericEngineIndexing: + + params = [ + _get_masked_engines(), + ["monotonic_incr", "monotonic_decr", "non_monotonic"], + [True, False], + [10**5, 2 * 10**6], # 2e6 is above SIZE_CUTOFF + ] + param_names = ["engine_and_dtype", "index_type", "unique", "N"] + + def setup(self, engine_and_dtype, index_type, unique, N): + engine, dtype = engine_and_dtype + + if index_type == "monotonic_incr": + if unique: + ser = Series(N * 3, dtype=dtype) + else: + values = list([1] * N + [2] * N + [3] * N) + ser = Series(values, dtype=dtype) + elif index_type == "monotonic_decr": + if unique: + ser = Series(N * 3, dtype=dtype)[::-1] + else: + values = list([1] * N + [2] * N + [3] * N) + ser = Series(values, dtype=dtype)[::-1] + else: + assert index_type == "non_monotonic" + if unique: + ser = Series(np.zeros(N * 3, dtype="uint8"), dtype=dtype) + ser[:N] = Series(np.arange(N * 2, N * 3), dtype=dtype) + ser[N:] = Series(np.arange(N * 2), dtype=dtype) + ser[-1] = NA + + else: + ser = Series([1, 2, 3] * N, dtype=dtype) + ser[-1] = NA + + self.data = engine(ser._values._data, ser._values._mask) + # code belows avoids populating the mapping etc. while timing. + self.data.get_loc(2) + + self.key_middle = ser[len(ser) // 2] + self.key_early = ser[2] + + def time_get_loc(self, engine_and_dtype, index_type, unique, N): + self.data.get_loc(self.key_early) + + def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): + # searchsorted performance may be different near the middle of a range + # vs near an endpoint + self.data.get_loc(self.key_middle) + + class ObjectEngineIndexing: params = [("monotonic_incr", "monotonic_decr", "non_monotonic")] diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index e60ccdb29c6b2..ae4fa5dfba97b 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -125,10 +125,12 @@ class HashTable: def map_locations( self, values: np.ndarray, # np.ndarray[subclass-specific] + mask: np.ndarray | None = ..., ) -> None: ... def lookup( self, values: np.ndarray, # np.ndarray[subclass-specific] + mask: np.ndarray | None = ..., ) -> npt.NDArray[np.intp]: ... def get_labels( self, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c6d8783d6f115..a9ce03ae83587 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -966,8 +966,9 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values) -> ndarray: + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1002,7 +1003,8 @@ cdef class StringHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values) -> None: + def map_locations(self, ndarray[object] values, object mask = None) -> None: + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1275,7 +1277,8 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(key) - def map_locations(self, ndarray[object] values) -> None: + def map_locations(self, ndarray[object] values, object mask = None) -> None: + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1289,8 +1292,9 @@ cdef class PyObjectHashTable(HashTable): k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i - def lookup(self, ndarray[object] values) -> ndarray: + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 8fff335352617..18f6306f1b56b 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -7,7 +7,7 @@ from pandas.core.arrays import ExtensionArray class IndexEngine: over_size_threshold: bool - def __init__(self, values: np.ndarray) -> None: ... + def __init__(self, values: np.ndarray, mask: np.ndarray | None = ...) -> None: ... def __contains__(self, val: object) -> bool: ... # -> int | slice | np.ndarray[bool] @@ -23,12 +23,19 @@ class IndexEngine: @property def is_mapping_populated(self) -> bool: ... def clear_mapping(self): ... - def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... + def get_indexer( + self, values: np.ndarray, mask: np.ndarray | None = ... + ) -> npt.NDArray[np.intp]: ... def get_indexer_non_unique( self, targets: np.ndarray, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +class MaskedEngine(IndexEngine): + def get_indexer_non_unique( # type: ignore[override] + self, targets: np.ndarray, target_mask: np.ndarray + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + class Float64Engine(IndexEngine): ... class Float32Engine(IndexEngine): ... class Complex128Engine(IndexEngine): ... @@ -46,6 +53,19 @@ class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... class BoolEngine(UInt8Engine): ... +class MaskedBoolEngine(MaskedUInt8Engine): ... +class MaskedFloat64Engine(MaskedEngine): ... +class MaskedFloat32Engine(MaskedEngine): ... +class MaskedComplex128Engine(MaskedEngine): ... +class MaskedComplex64Engine(MaskedEngine): ... +class MaskedInt64Engine(MaskedEngine): ... +class MaskedInt32Engine(MaskedEngine): ... +class MaskedInt16Engine(MaskedEngine): ... +class MaskedInt8Engine(MaskedEngine): ... +class MaskedUInt64Engine(MaskedEngine): ... +class MaskedUInt32Engine(MaskedEngine): ... +class MaskedUInt16Engine(MaskedEngine): ... +class MaskedUInt8Engine(MaskedEngine): ... class BaseMultiIndexCodesEngine: levels: list[np.ndarray] @@ -58,8 +78,7 @@ class BaseMultiIndexCodesEngine: offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] ) -> None: ... def get_indexer( - self, - target: npt.NDArray[np.object_], + self, target: npt.NDArray[np.object_], mask: np.ndarray | None = ... ) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... def get_indexer_with_fill( diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f968e879498b2..9bb28156f5c3c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -32,6 +32,7 @@ from pandas._libs import ( from pandas._libs.lib cimport eq_NA_compat from pandas._libs.missing cimport ( + C_NA, checknull, is_matching_na, ) @@ -45,7 +46,7 @@ cdef inline bint is_definitely_invalid_key(object val): return False -cdef ndarray _get_bool_indexer(ndarray values, object val): +cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): """ Return a ndarray[bool] of locations where val matches self.values. @@ -58,6 +59,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val): object item if values.descr.type_num == cnp.NPY_OBJECT: + assert mask is None # no mask for object dtype # i.e. values.dtype == object if not checknull(val): indexer = eq_NA_compat(values, val) @@ -71,10 +73,16 @@ cdef ndarray _get_bool_indexer(ndarray values, object val): indexer[i] = is_matching_na(item, val) else: - if util.is_nan(val): - indexer = np.isnan(values) + if mask is not None: + if checknull(val) and val is C_NA: + indexer = mask == 1 + else: + indexer = (values == val) & ~mask else: - indexer = values == val + if util.is_nan(val): + indexer = np.isnan(values) + else: + indexer = values == val return indexer.view(bool) @@ -108,6 +116,7 @@ cdef class IndexEngine: cdef readonly: ndarray values + ndarray mask HashTable mapping bint over_size_threshold @@ -116,8 +125,9 @@ cdef class IndexEngine: bint need_monotonic_check, need_unique_check object _np_type - def __init__(self, ndarray values): + def __init__(self, ndarray values, ndarray mask = None): self.values = values + self.mask = mask self.over_size_threshold = len(values) >= _SIZE_CUTOFF self.clear_mapping() @@ -158,6 +168,8 @@ cdef class IndexEngine: return self._get_loc_duplicates(val) try: + if self.mask is not None and checknull(val) and val is C_NA: + return self.mapping.get_na() return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 @@ -203,7 +215,7 @@ cdef class IndexEngine: cdef: ndarray[uint8_t, ndim=1, cast=True] indexer - indexer = _get_bool_indexer(self.values, val) + indexer = _get_bool_indexer(self.values, val, self.mask) return _unpack_bool_indexer(indexer, val) def sizeof(self, deep: bool = False) -> int: @@ -244,21 +256,25 @@ cdef class IndexEngine: cdef inline _do_monotonic_check(self): cdef: bint is_unique - try: - values = self.values - self.monotonic_inc, self.monotonic_dec, is_unique = \ - self._call_monotonic(values) - except TypeError: + if self.mask is not None and np.any(self.mask): self.monotonic_inc = 0 self.monotonic_dec = 0 - is_unique = 0 + else: + try: + values = self.values + self.monotonic_inc, self.monotonic_dec, is_unique = \ + self._call_monotonic(values) + except TypeError: + self.monotonic_inc = 0 + self.monotonic_dec = 0 + is_unique = 0 - self.need_monotonic_check = 0 + self.need_monotonic_check = 0 - # we can only be sure of uniqueness if is_unique=1 - if is_unique: - self.unique = 1 - self.need_unique_check = 0 + # we can only be sure of uniqueness if is_unique=1 + if is_unique: + self.unique = 1 + self.need_unique_check = 0 cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) @@ -283,7 +299,7 @@ cdef class IndexEngine: values = self.values self.mapping = self._make_hash_table(len(values)) - self.mapping.map_locations(values) + self.mapping.map_locations(values, self.mask) if len(self.mapping) == len(values): self.unique = 1 @@ -299,9 +315,9 @@ cdef class IndexEngine: self.monotonic_inc = 0 self.monotonic_dec = 0 - def get_indexer(self, ndarray values) -> np.ndarray: + def get_indexer(self, ndarray values, ndarray mask = None) -> np.ndarray: self._ensure_mapping_populated() - return self.mapping.lookup(values) + return self.mapping.lookup(values, mask) def get_indexer_non_unique(self, ndarray targets): """ @@ -684,7 +700,7 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zt)] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, target: np.ndarray) -> np.ndarray: + def get_indexer(self, target: np.ndarray, ndarray mask = None) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `self.values`, where -1 represents a value in `target` which does not @@ -693,12 +709,14 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- target : np.ndarray + mask: Compatibility with IndexEngine Returns ------- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ + assert mask is None # should never be not None return self._base.get_indexer(self, target) def get_indexer_with_fill(self, ndarray target, ndarray values, @@ -830,6 +848,15 @@ cdef class BoolEngine(UInt8Engine): return val +cdef class MaskedBoolEngine(MaskedUInt8Engine): + cdef _check_type(self, object val): + if checknull(val) and val is C_NA: + return val + if not util.is_bool_object(val): + raise KeyError(val) + return val + + @cython.internal @cython.freelist(32) cdef class SharedEngine: @@ -1099,3 +1126,123 @@ cdef class ExtensionEngine(SharedEngine): cdef _check_type(self, object val): hash(val) + + +cdef class MaskedIndexEngine(IndexEngine): + + def __init__(self, ndarray values, ndarray mask): + super().__init__(values, mask) + + def get_indexer_non_unique(self, ndarray targets, ndarray target_mask): + """ + Return an indexer suitable for taking from a non unique index + return the labels in the same order as the target + and a missing indexer into the targets (which correspond + to the -1 indices in the results + + Returns + ------- + indexer : np.ndarray[np.intp] + missing : np.ndarray[np.intp] + """ + cdef: + ndarray values, mask + ndarray[intp_t] result, missing + set stargets + list na_pos + dict d = {} + object val + Py_ssize_t count = 0, count_missing = 0 + Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx + bint check_na_values = False, found_na = False + + values = self.values + assert not values.dtype == object # go through object path instead + + mask = self.mask + stargets = set(targets[~target_mask]) + + n = len(values) + n_t = len(targets) + if n > 10_000: + n_alloc = 10_000 + else: + n_alloc = n + + result = np.empty(n_alloc, dtype=np.intp) + missing = np.empty(n_t, dtype=np.intp) + + # map each starget to its position in the index + if ( + stargets and + len(stargets) < 5 and + not np.any(target_mask) and + self.is_monotonic_increasing + ): + # if there are few enough stargets and the index is monotonically + # increasing, then use binary search for each starget + for starget in stargets: + start = values.searchsorted(starget, side='left') + end = values.searchsorted(starget, side='right') + if start != end: + d[starget] = list(range(start, end)) + + stargets = set() + + if stargets: + # otherwise, map by iterating through all items in the index + + na_pos = [] + + for i in range(n): + val = values[i] + + if mask[i]: + na_pos.append(i) + + else: + if val in stargets: + if val not in d: + d[val] = [] + d[val].append(i) + + for i in range(n_t): + val = targets[i] + + if target_mask[i]: + if na_pos: + for na_idx in na_pos: + # realloc if needed + if count >= n_alloc: + n_alloc += 10_000 + result = np.resize(result, n_alloc) + + result[count] = na_idx + count += 1 + continue + + elif val in d: + # found + key = val + + for j in d[key]: + + # realloc if needed + if count >= n_alloc: + n_alloc += 10_000 + result = np.resize(result, n_alloc) + + result[count] = j + count += 1 + continue + + # value not found + if count >= n_alloc: + n_alloc += 10_000 + result = np.resize(result, n_alloc) + result[count] = -1 + count += 1 + missing[count_missing] = i + count_missing += 1 + + return result[0:count], missing[0:count_missing] diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index b9c02ba64f69c..6001ff5b5bce0 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -24,17 +24,29 @@ dtypes = [('Float64', 'float64'), ('Complex64', 'complex64'), ('Complex128', 'complex128'), ] + +engines = [('', 'IndexEngine'), ('Masked', 'MaskedIndexEngine')] + }} {{for name, dtype in dtypes}} +{{for prefix, engine in engines}} -cdef class {{name}}Engine(IndexEngine): +cdef class {{prefix}}{{name}}Engine({{engine}}): cdef _make_hash_table(self, Py_ssize_t n): + {{if engine == 'MaskedIndexEngine'}} + return _hash.{{name}}HashTable(n, uses_mask=True) + {{else}} return _hash.{{name}}HashTable(n) + {{endif}} cdef _check_type(self, object val): + {{if engine == 'MaskedIndexEngine'}} + if checknull(val): + return val + {{endif}} {{if name not in {'Float64', 'Float32', 'Complex64', 'Complex128'} }} if not util.is_integer_object(val): if util.is_float_object(val): @@ -61,5 +73,6 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} return val +{{endfor}} {{endfor}} diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d8300bb29c274..77b6839000187 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -208,6 +208,22 @@ _dtype_obj = np.dtype("object") +_masked_engines = { + "Complex128": libindex.MaskedComplex128Engine, + "Complex64": libindex.MaskedComplex64Engine, + "Float64": libindex.MaskedFloat64Engine, + "Float32": libindex.MaskedFloat32Engine, + "UInt64": libindex.MaskedUInt64Engine, + "UInt32": libindex.MaskedUInt32Engine, + "UInt16": libindex.MaskedUInt16Engine, + "UInt8": libindex.MaskedUInt8Engine, + "Int64": libindex.MaskedInt64Engine, + "Int32": libindex.MaskedInt32Engine, + "Int16": libindex.MaskedInt16Engine, + "Int8": libindex.MaskedInt8Engine, + "boolean": libindex.MaskedBoolEngine, +} + def _maybe_return_indexers(meth: F) -> F: """ @@ -824,14 +840,19 @@ def _cleanup(self) -> None: @cache_readonly def _engine( self, - ) -> libindex.IndexEngine | libindex.ExtensionEngine: + ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedEngine: # For base class (object dtype) we get ObjectEngine target_values = self._get_engine_target() - if ( - isinstance(target_values, ExtensionArray) - and self._engine_type is libindex.ObjectEngine - ): - return libindex.ExtensionEngine(target_values) + if isinstance(target_values, ExtensionArray): + is_masked = hasattr(target_values, "_mask") + # error: "ExtensionArray" has no attribute "_data" + if is_masked: + return _masked_engines[target_values.dtype.name]( + target_values._data, + target_values._mask, # type: ignore[attr-defined] + ) + elif self._engine_type is libindex.ObjectEngine: + return libindex.ExtensionEngine(target_values) target_values = cast(np.ndarray, target_values) # to avoid a reference cycle, bind `target_values` to a local variable, so @@ -3872,7 +3893,14 @@ def _get_indexer( else: tgt_values = target._get_engine_target() - indexer = self._engine.get_indexer(tgt_values) + if is_extension_array_dtype(tgt_values): + # Too many arguments for "get_indexer_non_unique" of "IndexEngine" + indexer = self._engine.get_indexer( # type: ignore[call-arg] + tgt_values._data, + tgt_values._mask, + ) + else: + indexer = self._engine.get_indexer(tgt_values) return ensure_platform_int(indexer) @@ -5040,8 +5068,11 @@ def _get_engine_target(self) -> ArrayLike: # GH#45652 much more performant than ExtensionEngine return vals._ndarray if type(self) is Index and isinstance(self._values, ExtensionArray): - # TODO(ExtensionIndex): remove special-case, just use self._values - return self._values.astype(object) + from pandas.core.arrays import BaseMaskedArray + + if not isinstance(self._values, BaseMaskedArray): + # TODO(ExtensionIndex): remove special-case, just use self._values + return self._values.astype(object) return vals def _from_join_target(self, result: np.ndarray) -> ArrayLike: @@ -5867,8 +5898,17 @@ def get_indexer_non_unique( # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has # no attribute "_extract_level_codes" tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] + if is_extension_array_dtype(tgt_values): + # Too many arguments for "get_indexer_non_unique" of "IndexEngine" + # start = time.time() + indexer, missing = self._engine.get_indexer_non_unique( + tgt_values._data, + tgt_values._mask, # type: ignore[call-arg] + ) + # print(time.time()-start) + else: + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) @final From a051815c11cf100232b6e017ed9bcd8271ec5acb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 19:08:19 +0100 Subject: [PATCH 02/20] Add tests --- asv_bench/benchmarks/indexing_engines.py | 12 ++-- pandas/tests/indexes/numeric/test_indexing.py | 64 +++++++++++++++++++ 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index e8726a36cb2c9..613ef868fba24 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -120,22 +120,24 @@ def setup(self, engine_and_dtype, index_type, unique, N): if index_type == "monotonic_incr": if unique: - ser = Series(N * 3, dtype=dtype) + ser = Series(np.arange(N * 3, dtype=dtype.lower()), dtype=dtype) else: values = list([1] * N + [2] * N + [3] * N) ser = Series(values, dtype=dtype) elif index_type == "monotonic_decr": if unique: - ser = Series(N * 3, dtype=dtype)[::-1] + ser = Series(np.arange(N * 3, dtype=dtype.lower()), dtype=dtype)[::-1] else: values = list([1] * N + [2] * N + [3] * N) ser = Series(values, dtype=dtype)[::-1] else: assert index_type == "non_monotonic" if unique: - ser = Series(np.zeros(N * 3, dtype="uint8"), dtype=dtype) - ser[:N] = Series(np.arange(N * 2, N * 3), dtype=dtype) - ser[N:] = Series(np.arange(N * 2), dtype=dtype) + ser = Series(np.zeros(N * 3, dtype=dtype.lower()), dtype=dtype) + ser[:N] = Series( + np.arange(N * 2, N * 3, dtype=dtype.lower()), dtype=dtype + ) + ser[N:] = Series(np.arange(N * 2, dtype=dtype.lower()), dtype=dtype) ser[-1] = NA else: diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 0c2c5e0b903bc..08e6980e3ee94 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -4,12 +4,14 @@ from pandas.errors import InvalidIndexError from pandas import ( + NA, Index, RangeIndex, Series, Timestamp, ) import pandas._testing as tm +from pandas.core.arrays import FloatingArray from pandas.core.indexes.api import ( Float64Index, Int64Index, @@ -391,6 +393,68 @@ def test_get_indexer_uint64(self, index_large): expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) + @pytest.mark.parametrize("val, val2", [(4, 5), (4, 4), (4, NA), (NA, NA)]) + def test_get_loc_masked(self, val, val2, any_numeric_ea_dtype): + # GH#39133 + idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_dtype) + result = idx.get_loc(2) + assert result == 1 + + with pytest.raises(KeyError, match="9"): + idx.get_loc(9) + + def test_get_loc_masked_na(self, any_numeric_ea_dtype): + # GH#39133 + idx = Index([1, 2, NA], dtype=any_numeric_ea_dtype) + result = idx.get_loc(NA) + assert result == 2 + + idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_dtype) + result = idx.get_loc(NA) + tm.assert_numpy_array_equal(result, np.array([False, False, True, True])) + + idx = Index([1, 2, 3], dtype=any_numeric_ea_dtype) + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) + + def test_get_loc_masked_na_and_nan(self): + # GH#39133 + idx = Index( + FloatingArray( + np.array([1, 2, 1, np.nan]), mask=np.array([False, False, True, False]) + ) + ) + result = idx.get_loc(NA) + assert result == 2 + result = idx.get_loc(np.nan) + assert result == 3 + + idx = Index( + FloatingArray(np.array([1, 2, 1.0]), mask=np.array([False, False, True])) + ) + result = idx.get_loc(NA) + assert result == 2 + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) + + idx = Index( + FloatingArray( + np.array([1, 2, np.nan]), mask=np.array([False, False, False]) + ) + ) + result = idx.get_loc(np.nan) + assert result == 2 + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) + + @pytest.mark.parametrize("val", [4, 2]) + def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val): + # GH#39133 + idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_dtype) + result = idx.get_indexer_for([1, NA, 5]) + expected = np.array([0, 2, -1]) + tm.assert_numpy_array_equal(result, expected) + class TestWhere: @pytest.mark.parametrize( From 3513430afbb20dce6a3486f81706227d1ef874c8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 19:18:05 +0100 Subject: [PATCH 03/20] Fix asv --- asv_bench/benchmarks/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 7301c8e1d8204..b764dc7ada0fb 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -101,7 +101,7 @@ def setup(self, dtype, monotonic): indices = { True: Index(range(N), dtype=dtype), False: Index( - list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype + list(range(50)) + [54, 53, 52, 51] + list(range(55, N - 1)), dtype=dtype ).append(Index([NA], dtype=dtype)), } self.data = indices[monotonic] From 2246123fb49097e4569028fe398d8a6427f7e7b2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 19:20:37 +0100 Subject: [PATCH 04/20] Fix mypy --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 77b6839000187..021ca3b416adb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -848,7 +848,7 @@ def _engine( # error: "ExtensionArray" has no attribute "_data" if is_masked: return _masked_engines[target_values.dtype.name]( - target_values._data, + target_values._data, # type: ignore[attr-defined] target_values._mask, # type: ignore[attr-defined] ) elif self._engine_type is libindex.ObjectEngine: From 1d35e2e487d7446662dd03c9bed3d522bd3d3def Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 19:25:43 +0100 Subject: [PATCH 05/20] Add test --- pandas/_libs/index.pyx | 6 +++--- pandas/tests/indexes/numeric/test_indexing.py | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 9bb28156f5c3c..9d17f686d09b5 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -74,7 +74,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): else: if mask is not None: - if checknull(val) and val is C_NA: + if val is C_NA: indexer = mask == 1 else: indexer = (values == val) & ~mask @@ -168,7 +168,7 @@ cdef class IndexEngine: return self._get_loc_duplicates(val) try: - if self.mask is not None and checknull(val) and val is C_NA: + if self.mask is not None and val is C_NA: return self.mapping.get_na() return self.mapping.get_item(val) except OverflowError as err: @@ -850,7 +850,7 @@ cdef class BoolEngine(UInt8Engine): cdef class MaskedBoolEngine(MaskedUInt8Engine): cdef _check_type(self, object val): - if checknull(val) and val is C_NA: + if val is C_NA: return val if not util.is_bool_object(val): raise KeyError(val) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 08e6980e3ee94..2ae92265103f6 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -455,6 +455,14 @@ def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val): expected = np.array([0, 2, -1]) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_masked_na_boolean(self): + # GH#39133 + idx = Index([True, False, NA], dtype="boolean") + result = idx.get_loc(False) + assert result == 1 + result = idx.get_loc(NA) + assert result == 2 + class TestWhere: @pytest.mark.parametrize( From 797269a36da08e0d3384dfe71e8915aef2385934 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 20:22:58 +0100 Subject: [PATCH 06/20] Fix error --- pandas/_libs/index_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 6001ff5b5bce0..bf3d88edd9386 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -44,7 +44,7 @@ cdef class {{prefix}}{{name}}Engine({{engine}}): cdef _check_type(self, object val): {{if engine == 'MaskedIndexEngine'}} - if checknull(val): + if val is C_NA: return val {{endif}} {{if name not in {'Float64', 'Float32', 'Complex64', 'Complex128'} }} From 32e6db411d20ee19bfbe001002a8a8e802c69872 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 1 Nov 2022 15:36:41 +0100 Subject: [PATCH 07/20] Fix windows builds --- pandas/tests/indexes/numeric/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 2ae92265103f6..fd72f929c9403 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -453,7 +453,7 @@ def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val): idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_dtype) result = idx.get_indexer_for([1, NA, 5]) expected = np.array([0, 2, -1]) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) def test_get_indexer_masked_na_boolean(self): # GH#39133 From 36a0d29c3206ae501cc5eb58191ed58a1bfa7d62 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 1 Nov 2022 15:44:18 +0100 Subject: [PATCH 08/20] Fix typing --- pandas/_libs/index.pyi | 26 +++++++++++++------------- pandas/core/indexes/base.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 18f6306f1b56b..0c2639478f4e1 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -31,7 +31,7 @@ class IndexEngine: targets: np.ndarray, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... -class MaskedEngine(IndexEngine): +class MaskedIndexEngine(IndexEngine): def get_indexer_non_unique( # type: ignore[override] self, targets: np.ndarray, target_mask: np.ndarray ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... @@ -54,18 +54,18 @@ class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... class BoolEngine(UInt8Engine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... -class MaskedFloat64Engine(MaskedEngine): ... -class MaskedFloat32Engine(MaskedEngine): ... -class MaskedComplex128Engine(MaskedEngine): ... -class MaskedComplex64Engine(MaskedEngine): ... -class MaskedInt64Engine(MaskedEngine): ... -class MaskedInt32Engine(MaskedEngine): ... -class MaskedInt16Engine(MaskedEngine): ... -class MaskedInt8Engine(MaskedEngine): ... -class MaskedUInt64Engine(MaskedEngine): ... -class MaskedUInt32Engine(MaskedEngine): ... -class MaskedUInt16Engine(MaskedEngine): ... -class MaskedUInt8Engine(MaskedEngine): ... +class MaskedFloat64Engine(MaskedIndexEngine): ... +class MaskedFloat32Engine(MaskedIndexEngine): ... +class MaskedComplex128Engine(MaskedIndexEngine): ... +class MaskedComplex64Engine(MaskedIndexEngine): ... +class MaskedInt64Engine(MaskedIndexEngine): ... +class MaskedInt32Engine(MaskedIndexEngine): ... +class MaskedInt16Engine(MaskedIndexEngine): ... +class MaskedInt8Engine(MaskedIndexEngine): ... +class MaskedUInt64Engine(MaskedIndexEngine): ... +class MaskedUInt32Engine(MaskedIndexEngine): ... +class MaskedUInt16Engine(MaskedIndexEngine): ... +class MaskedUInt8Engine(MaskedIndexEngine): ... class BaseMultiIndexCodesEngine: levels: list[np.ndarray] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 021ca3b416adb..47a306163cdc4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -840,7 +840,7 @@ def _cleanup(self) -> None: @cache_readonly def _engine( self, - ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedEngine: + ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedIndexEngine: # For base class (object dtype) we get ObjectEngine target_values = self._get_engine_target() if isinstance(target_values, ExtensionArray): From ddcdb13f5087f146e871bb8526c8ab8c3852db89 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 2 Nov 2022 15:05:30 +0100 Subject: [PATCH 09/20] Use np arrays --- asv_bench/benchmarks/indexing_engines.py | 35 ++++++++++-------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 613ef868fba24..a07b504b4f856 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -9,11 +9,6 @@ from pandas._libs import index as libindex -from pandas import ( - NA, - Series, -) - def _get_numeric_engines(): engine_names = [ @@ -120,36 +115,36 @@ def setup(self, engine_and_dtype, index_type, unique, N): if index_type == "monotonic_incr": if unique: - ser = Series(np.arange(N * 3, dtype=dtype.lower()), dtype=dtype) + arr = np.arange(N * 3, dtype=dtype.lower()) else: values = list([1] * N + [2] * N + [3] * N) - ser = Series(values, dtype=dtype) + arr = np.array(values, dtype=dtype.lower()) + mask = np.zeros(N * 3, dtype="uint8") elif index_type == "monotonic_decr": if unique: - ser = Series(np.arange(N * 3, dtype=dtype.lower()), dtype=dtype)[::-1] + arr = np.arange(N * 3, dtype=dtype.lower())[::-1] else: values = list([1] * N + [2] * N + [3] * N) - ser = Series(values, dtype=dtype)[::-1] + arr = np.array(values, dtype=dtype.lower())[::-1] + mask = np.zeros(N * 3, dtype="uint8") else: assert index_type == "non_monotonic" if unique: - ser = Series(np.zeros(N * 3, dtype=dtype.lower()), dtype=dtype) - ser[:N] = Series( - np.arange(N * 2, N * 3, dtype=dtype.lower()), dtype=dtype - ) - ser[N:] = Series(np.arange(N * 2, dtype=dtype.lower()), dtype=dtype) - ser[-1] = NA + arr = np.zeros(N * 3, dtype=dtype.lower()) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower()) + arr[N:] = np.arange(N * 2, dtype=dtype.lower()) else: - ser = Series([1, 2, 3] * N, dtype=dtype) - ser[-1] = NA + arr = np.array([1, 2, 3] * N, dtype=dtype.lower()) + mask = np.zeros(N * 3, dtype="uint8") + mask[-1] = True - self.data = engine(ser._values._data, ser._values._mask) + self.data = engine(arr, mask) # code belows avoids populating the mapping etc. while timing. self.data.get_loc(2) - self.key_middle = ser[len(ser) // 2] - self.key_early = ser[2] + self.key_middle = arr[len(arr) // 2] + self.key_early = arr[2] def time_get_loc(self, engine_and_dtype, index_type, unique, N): self.data.get_loc(self.key_early) From e29a9701581484d7f7964272570913e12abfa249 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 10 Nov 2022 21:52:16 +0100 Subject: [PATCH 10/20] Adress review --- pandas/_libs/index.pyx | 4 ++-- pandas/core/indexes/base.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 9d17f686d09b5..6ddbf9c149041 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -166,10 +166,10 @@ cdef class IndexEngine: self._ensure_mapping_populated() if not self.unique: return self._get_loc_duplicates(val) + if self.mask is not None and val is C_NA: + return self.mapping.get_na() try: - if self.mask is not None and val is C_NA: - return self.mapping.get_na() return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 47a306163cdc4..b4ccca6741ad9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -844,12 +844,12 @@ def _engine( # For base class (object dtype) we get ObjectEngine target_values = self._get_engine_target() if isinstance(target_values, ExtensionArray): - is_masked = hasattr(target_values, "_mask") - # error: "ExtensionArray" has no attribute "_data" - if is_masked: + from pandas.core.arrays import BaseMaskedArray + + if isinstance(target_values, BaseMaskedArray): return _masked_engines[target_values.dtype.name]( - target_values._data, # type: ignore[attr-defined] - target_values._mask, # type: ignore[attr-defined] + target_values._data, + target_values._mask, ) elif self._engine_type is libindex.ObjectEngine: return libindex.ExtensionEngine(target_values) @@ -3893,7 +3893,9 @@ def _get_indexer( else: tgt_values = target._get_engine_target() - if is_extension_array_dtype(tgt_values): + from pandas.core.arrays import BaseMaskedArray + + if isinstance(tgt_values, BaseMaskedArray): # Too many arguments for "get_indexer_non_unique" of "IndexEngine" indexer = self._engine.get_indexer( # type: ignore[call-arg] tgt_values._data, From aec65a721e27a884387dda139bf670622a99c9e1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 22 Nov 2022 21:20:09 +0000 Subject: [PATCH 11/20] Adapt to join difference --- pandas/core/indexes/base.py | 29 +++++++++++++++++++++-------- pandas/core/indexes/datetimelike.py | 4 ++++ pandas/core/indexes/extension.py | 3 +++ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ada478cbb121b..d6d2671ee19bf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -355,8 +355,8 @@ class Index(IndexOpsMixin, PandasObject): @final def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_engine_target() - ov = other._get_engine_target() + sv = self._get_join_target() + ov = other._get_join_target() # can_use_libjoin assures sv and ov are ndarrays sv = cast(np.ndarray, sv) ov = cast(np.ndarray, ov) @@ -367,8 +367,8 @@ def _left_indexer( self: _IndexT, other: _IndexT ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_engine_target() - ov = other._get_engine_target() + sv = self._get_join_target() + ov = other._get_join_target() # can_use_libjoin assures sv and ov are ndarrays sv = cast(np.ndarray, sv) ov = cast(np.ndarray, ov) @@ -381,8 +381,8 @@ def _inner_indexer( self: _IndexT, other: _IndexT ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_engine_target() - ov = other._get_engine_target() + sv = self._get_join_target() + ov = other._get_join_target() # can_use_libjoin assures sv and ov are ndarrays sv = cast(np.ndarray, sv) ov = cast(np.ndarray, ov) @@ -395,8 +395,8 @@ def _outer_indexer( self: _IndexT, other: _IndexT ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_engine_target() - ov = other._get_engine_target() + sv = self._get_join_target() + ov = other._get_join_target() # can_use_libjoin assures sv and ov are ndarrays sv = cast(np.ndarray, sv) ov = cast(np.ndarray, ov) @@ -4929,6 +4929,19 @@ def _get_engine_target(self) -> ArrayLike: return self._values.astype(object) return vals + def _get_join_target(self) -> ArrayLike: + """ + Get the ndarray or ExtensionArray that we can pass to the IndexEngine + constructor. + """ + vals = self._values + if isinstance(vals, StringArray): + # GH#45652 much more performant than ExtensionEngine + return vals._ndarray + if type(self) is Index and isinstance(self._values, ExtensionArray): + return self._values.astype(object) + return vals + def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ Cast the ndarray returned from one of the libjoin.foo_indexer functions diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c6c8695ab01da..ef8dfb39790cd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -612,6 +612,10 @@ def _get_engine_target(self) -> np.ndarray: # engine methods and libjoin methods need dt64/td64 values cast to i8 return self._data._ndarray.view("i8") + def _get_join_target(self) -> np.ndarray: + # engine methods and libjoin methods need dt64/td64 values cast to i8 + return self._data._ndarray.view("i8") + def _from_join_target(self, result: np.ndarray): # view e.g. i8 back to M8[ns] result = result.view(self._data._ndarray.dtype) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 81d502b60d609..1c3ea927c41c5 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -187,6 +187,9 @@ class NDArrayBackedExtensionIndex(ExtensionIndex): def _get_engine_target(self) -> np.ndarray: return self._data._ndarray + def _get_join_target(self) -> np.ndarray: + return self._data._ndarray + def _from_join_target(self, result: np.ndarray) -> ArrayLike: assert result.dtype == self._data._ndarray.dtype return self._data._from_backing_data(result) From 69113cb71b15d5f925ed548f05760ea9b7a52015 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 30 Nov 2022 22:30:02 +0100 Subject: [PATCH 12/20] Address review --- pandas/_libs/hashtable.pyi | 4 ++-- pandas/_libs/index.pyx | 4 ---- pandas/core/indexes/base.py | 23 +++++++++++++---------- pandas/core/indexes/datetimelike.py | 4 ---- pandas/core/indexes/extension.py | 2 +- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 4d38d4bf1a91b..18dfad3d64e67 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -162,12 +162,12 @@ class HashTable: def map_locations( self, values: np.ndarray, # np.ndarray[subclass-specific] - mask: np.ndarray | None = ..., + mask: npt.NDArray[np.bool_] | None = ..., ) -> None: ... def lookup( self, values: np.ndarray, # np.ndarray[subclass-specific] - mask: np.ndarray | None = ..., + mask: npt.NDArray[np.bool_] | None = ..., ) -> npt.NDArray[np.intp]: ... def get_labels( self, diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 7e045988cc57d..be73dafdfe035 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1141,9 +1141,6 @@ cdef class ExtensionEngine(SharedEngine): cdef class MaskedIndexEngine(IndexEngine): - def __init__(self, ndarray values, ndarray mask): - super().__init__(values, mask) - def get_indexer_non_unique(self, ndarray targets, ndarray target_mask): """ Return an indexer suitable for taking from a non unique index @@ -1165,7 +1162,6 @@ cdef class MaskedIndexEngine(IndexEngine): object val Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx - bint check_na_values = False, found_na = False values = self.values assert not values.dtype == object # go through object path instead diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4f102a227a0a0..74060b32e7e00 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4911,8 +4911,7 @@ def _get_engine_target(self) -> ArrayLike: def _get_join_target(self) -> ArrayLike: """ - Get the ndarray or ExtensionArray that we can pass to the IndexEngine - constructor. + Get the ndarray that we can pass to the join functions. """ vals = self._values if isinstance(vals, StringArray): @@ -5714,14 +5713,18 @@ def get_indexer_non_unique( # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has # no attribute "_extract_level_codes" tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] - if is_extension_array_dtype(tgt_values): - # Too many arguments for "get_indexer_non_unique" of "IndexEngine" - # start = time.time() - indexer, missing = self._engine.get_indexer_non_unique( - tgt_values._data, - tgt_values._mask, # type: ignore[call-arg] - ) - # print(time.time()-start) + if is_extension_array_dtype(tgt_values.dtype): + # Hide here for performance reasons + from pandas.core.arrays import BaseMaskedArray + + if isinstance(tgt_values, BaseMaskedArray): + # Too many arguments for "get_indexer_non_unique" of "IndexEngine" + indexer, missing = self._engine.get_indexer_non_unique( + tgt_values._data, + tgt_values._mask, # type: ignore[call-arg] + ) + else: + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) else: indexer, missing = self._engine.get_indexer_non_unique(tgt_values) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d9e62f2e8e9b9..a6c396555e9a7 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -635,10 +635,6 @@ def _get_engine_target(self) -> np.ndarray: # engine methods and libjoin methods need dt64/td64 values cast to i8 return self._data._ndarray.view("i8") - def _get_join_target(self) -> np.ndarray: - # engine methods and libjoin methods need dt64/td64 values cast to i8 - return self._data._ndarray.view("i8") - def _from_join_target(self, result: np.ndarray): # view e.g. i8 back to M8[ns] result = result.view(self._data._ndarray.dtype) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 1c3ea927c41c5..0fc8b30abb882 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -188,7 +188,7 @@ def _get_engine_target(self) -> np.ndarray: return self._data._ndarray def _get_join_target(self) -> np.ndarray: - return self._data._ndarray + return self._get_engine_target() def _from_join_target(self, result: np.ndarray) -> ArrayLike: assert result.dtype == self._data._ndarray.dtype From cadc2394658c45033ce4641c1dc5287875488481 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 30 Nov 2022 22:30:26 +0100 Subject: [PATCH 13/20] Add todo --- pandas/_libs/index.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index be73dafdfe035..44cd8ae200aea 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1153,6 +1153,7 @@ cdef class MaskedIndexEngine(IndexEngine): indexer : np.ndarray[np.intp] missing : np.ndarray[np.intp] """ + # TODO: Unify with parent class cdef: ndarray values, mask ndarray[intp_t] result, missing From 06fbe03814d290b9993bf24c02a9bab40d9abdbc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 14 Dec 2022 00:26:44 +0100 Subject: [PATCH 14/20] Move import --- pandas/core/indexes/base.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9d3ec08d0bf37..399e5a489fac0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -145,6 +145,7 @@ validate_putmask, ) from pandas.core.arrays import ( + BaseMaskedArray, Categorical, ExtensionArray, ) @@ -812,7 +813,6 @@ def _engine( # For base class (object dtype) we get ObjectEngine target_values = self._get_engine_target() if isinstance(target_values, ExtensionArray): - from pandas.core.arrays import BaseMaskedArray if isinstance(target_values, BaseMaskedArray): return _masked_engines[target_values.dtype.name]( @@ -3652,8 +3652,6 @@ def _get_indexer( else: tgt_values = target._get_engine_target() - from pandas.core.arrays import BaseMaskedArray - if isinstance(tgt_values, BaseMaskedArray): # Too many arguments for "get_indexer_non_unique" of "IndexEngine" indexer = self._engine.get_indexer( # type: ignore[call-arg] @@ -4825,8 +4823,6 @@ def _get_engine_target(self) -> ArrayLike: # GH#45652 much more performant than ExtensionEngine return vals._ndarray if type(self) is Index and isinstance(self._values, ExtensionArray): - from pandas.core.arrays import BaseMaskedArray - if not isinstance(self._values, BaseMaskedArray): # TODO(ExtensionIndex): remove special-case, just use self._values return self._values.astype(object) @@ -5662,9 +5658,6 @@ def get_indexer_non_unique( # no attribute "_extract_level_codes" tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] if is_extension_array_dtype(tgt_values.dtype): - # Hide here for performance reasons - from pandas.core.arrays import BaseMaskedArray - if isinstance(tgt_values, BaseMaskedArray): # Too many arguments for "get_indexer_non_unique" of "IndexEngine" indexer, missing = self._engine.get_indexer_non_unique( From 44ba473c14d4b65fd4c5c3324defb514d6bc9134 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 22:57:13 +0100 Subject: [PATCH 15/20] Refactor --- asv_bench/benchmarks/indexing_engines.py | 10 ++++--- pandas/_libs/index.pyi | 15 ++++------ pandas/_libs/index.pyx | 36 +++++++++++++++--------- pandas/core/indexes/base.py | 27 +++--------------- 4 files changed, 38 insertions(+), 50 deletions(-) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index a07b504b4f856..ad15d821ba116 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -9,6 +9,8 @@ from pandas._libs import index as libindex +from pandas.core.arrays import BaseMaskedArray + def _get_numeric_engines(): engine_names = [ @@ -119,14 +121,14 @@ def setup(self, engine_and_dtype, index_type, unique, N): else: values = list([1] * N + [2] * N + [3] * N) arr = np.array(values, dtype=dtype.lower()) - mask = np.zeros(N * 3, dtype="uint8") + mask = np.zeros(N * 3, dtype=np.bool_) elif index_type == "monotonic_decr": if unique: arr = np.arange(N * 3, dtype=dtype.lower())[::-1] else: values = list([1] * N + [2] * N + [3] * N) arr = np.array(values, dtype=dtype.lower())[::-1] - mask = np.zeros(N * 3, dtype="uint8") + mask = np.zeros(N * 3, dtype=np.bool_) else: assert index_type == "non_monotonic" if unique: @@ -136,10 +138,10 @@ def setup(self, engine_and_dtype, index_type, unique, N): else: arr = np.array([1, 2, 3] * N, dtype=dtype.lower()) - mask = np.zeros(N * 3, dtype="uint8") + mask = np.zeros(N * 3, dtype=np.bool_) mask[-1] = True - self.data = engine(arr, mask) + self.data = engine(BaseMaskedArray(arr, mask)) # code belows avoids populating the mapping etc. while timing. self.data.get_loc(2) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 0c2639478f4e1..4b4c4d65d1ea4 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -7,7 +7,7 @@ from pandas.core.arrays import ExtensionArray class IndexEngine: over_size_threshold: bool - def __init__(self, values: np.ndarray, mask: np.ndarray | None = ...) -> None: ... + def __init__(self, values: np.ndarray) -> None: ... def __contains__(self, val: object) -> bool: ... # -> int | slice | np.ndarray[bool] @@ -23,17 +23,16 @@ class IndexEngine: @property def is_mapping_populated(self) -> bool: ... def clear_mapping(self): ... - def get_indexer( - self, values: np.ndarray, mask: np.ndarray | None = ... - ) -> npt.NDArray[np.intp]: ... + def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... def get_indexer_non_unique( self, targets: np.ndarray, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... class MaskedIndexEngine(IndexEngine): - def get_indexer_non_unique( # type: ignore[override] - self, targets: np.ndarray, target_mask: np.ndarray + def __init__(self, values: object) -> None: ... + def get_indexer_non_unique( + self, targets: object ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... class Float64Engine(IndexEngine): ... @@ -77,9 +76,7 @@ class BaseMultiIndexCodesEngine: labels: list[np.ndarray], # all entries integer-dtyped offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] ) -> None: ... - def get_indexer( - self, target: npt.NDArray[np.object_], mask: np.ndarray | None = ... - ) -> npt.NDArray[np.intp]: ... + def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... def get_indexer_with_fill( self, diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index a12018a935214..94d21f39dc61a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -128,9 +128,9 @@ cdef class IndexEngine: bint need_monotonic_check, need_unique_check object _np_type - def __init__(self, ndarray values, ndarray mask = None): + def __init__(self, ndarray values): self.values = values - self.mask = mask + self.mask = None self.over_size_threshold = len(values) >= _SIZE_CUTOFF self.clear_mapping() @@ -318,9 +318,9 @@ cdef class IndexEngine: self.monotonic_inc = 0 self.monotonic_dec = 0 - def get_indexer(self, ndarray values, ndarray mask = None) -> np.ndarray: + def get_indexer(self, ndarray values) -> np.ndarray: self._ensure_mapping_populated() - return self.mapping.lookup(values, mask) + return self.mapping.lookup(values) def get_indexer_non_unique(self, ndarray targets): """ @@ -711,7 +711,7 @@ cdef class BaseMultiIndexCodesEngine: level_codes.append(result) return self._codes_to_ints(np.array(level_codes, dtype="uint64").T) - def get_indexer(self, target: np.ndarray, ndarray mask = None) -> np.ndarray: + def get_indexer(self, target: np.ndarray) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `self.values`, where -1 represents a value in `target` which does not @@ -720,14 +720,12 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- target : np.ndarray - mask: Compatibility with IndexEngine Returns ------- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ - assert mask is None # should never be not None return self._base.get_indexer(self, target) def get_indexer_with_fill(self, ndarray target, ndarray values, @@ -1140,8 +1138,15 @@ cdef class ExtensionEngine(SharedEngine): cdef class MaskedIndexEngine(IndexEngine): + def __init__(self, object values): + super().__init__(values._data) + self.mask = values._mask + + def get_indexer(self, object values) -> np.ndarray: + self._ensure_mapping_populated() + return self.mapping.lookup(values._data, values._mask) - def get_indexer_non_unique(self, ndarray targets, ndarray target_mask): + def get_indexer_non_unique(self, object targets): """ Return an indexer suitable for taking from a non unique index return the labels in the same order as the target @@ -1155,7 +1160,7 @@ cdef class MaskedIndexEngine(IndexEngine): """ # TODO: Unify with parent class cdef: - ndarray values, mask + ndarray values, mask, target_vals, target_mask ndarray[intp_t] result, missing set stargets list na_pos @@ -1164,14 +1169,17 @@ cdef class MaskedIndexEngine(IndexEngine): Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx + target_vals = targets._data + target_mask = targets._mask + values = self.values assert not values.dtype == object # go through object path instead mask = self.mask - stargets = set(targets[~target_mask]) + stargets = set(target_vals[~target_mask]) n = len(values) - n_t = len(targets) + n_t = len(target_vals) if n > 10_000: n_alloc = 10_000 else: @@ -1190,8 +1198,8 @@ cdef class MaskedIndexEngine(IndexEngine): # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget for starget in stargets: - start = values.searchsorted(starget, side='left') - end = values.searchsorted(starget, side='right') + start = values.searchsorted(starget, side="left") + end = values.searchsorted(starget, side="right") if start != end: d[starget] = list(range(start, end)) @@ -1215,7 +1223,7 @@ cdef class MaskedIndexEngine(IndexEngine): d[val].append(i) for i in range(n_t): - val = targets[i] + val = target_vals[i] if target_mask[i]: if na_pos: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e09045637ee66..8a95bf748d739 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -815,10 +815,7 @@ def _engine( if isinstance(target_values, ExtensionArray): if isinstance(target_values, BaseMaskedArray): - return _masked_engines[target_values.dtype.name]( - target_values._data, - target_values._mask, - ) + return _masked_engines[target_values.dtype.name](target_values) elif self._engine_type is libindex.ObjectEngine: return libindex.ExtensionEngine(target_values) @@ -3652,14 +3649,7 @@ def _get_indexer( else: tgt_values = target._get_engine_target() - if isinstance(tgt_values, BaseMaskedArray): - # Too many arguments for "get_indexer_non_unique" of "IndexEngine" - indexer = self._engine.get_indexer( # type: ignore[call-arg] - tgt_values._data, - tgt_values._mask, - ) - else: - indexer = self._engine.get_indexer(tgt_values) + indexer = self._engine.get_indexer(tgt_values) return ensure_platform_int(indexer) @@ -5657,17 +5647,8 @@ def get_indexer_non_unique( # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has # no attribute "_extract_level_codes" tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] - if is_extension_array_dtype(tgt_values.dtype): - if isinstance(tgt_values, BaseMaskedArray): - # Too many arguments for "get_indexer_non_unique" of "IndexEngine" - indexer, missing = self._engine.get_indexer_non_unique( - tgt_values._data, - tgt_values._mask, # type: ignore[call-arg] - ) - else: - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - else: - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) From 1a6fec075e07c2465b2ce14caebe320a7c0f1d2c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Dec 2022 23:15:21 +0100 Subject: [PATCH 16/20] Remove unnecessary function --- pandas/core/indexes/base.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index da8b707fadec7..40005ee8df07b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4781,18 +4781,6 @@ def _get_engine_target(self) -> ArrayLike: return self._values.astype(object) return vals - def _get_join_target(self) -> ArrayLike: - """ - Get the ndarray that we can pass to the join functions. - """ - vals = self._values - if isinstance(vals, StringArray): - # GH#45652 much more performant than ExtensionEngine - return vals._ndarray - if type(self) is Index and isinstance(self._values, ExtensionArray): - return self._values.astype(object) - return vals - def _get_join_target(self) -> ArrayLike: """ Get the ndarray or ExtensionArray that we can pass to the join From befac7326112f54300795409d11625cec054c91c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Dec 2022 23:16:10 +0100 Subject: [PATCH 17/20] Remove unnecessary function --- pandas/core/indexes/base.py | 1 - pandas/core/indexes/extension.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 40005ee8df07b..adcf3de747762 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5612,7 +5612,6 @@ def get_indexer_non_unique( tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - return ensure_platform_int(indexer), ensure_platform_int(missing) @final diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 0fc8b30abb882..81d502b60d609 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -187,9 +187,6 @@ class NDArrayBackedExtensionIndex(ExtensionIndex): def _get_engine_target(self) -> np.ndarray: return self._data._ndarray - def _get_join_target(self) -> np.ndarray: - return self._get_engine_target() - def _from_join_target(self, result: np.ndarray) -> ArrayLike: assert result.dtype == self._data._ndarray.dtype return self._data._from_backing_data(result) From efde9dd0c44afae35f4170beff5c2ffc5fe1588d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Jan 2023 22:50:49 +0100 Subject: [PATCH 18/20] Combine conditions --- pandas/core/indexes/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7c19faf3b49c2..1b613790dece0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4778,10 +4778,13 @@ def _get_engine_target(self) -> ArrayLike: if isinstance(vals, StringArray): # GH#45652 much more performant than ExtensionEngine return vals._ndarray - if type(self) is Index and isinstance(self._values, ExtensionArray): - if not isinstance(self._values, BaseMaskedArray): - # TODO(ExtensionIndex): remove special-case, just use self._values - return self._values.astype(object) + if ( + type(self) is Index + and isinstance(self._values, ExtensionArray) + and not isinstance(self._values, BaseMaskedArray) + ): + # TODO(ExtensionIndex): remove special-case, just use self._values + return self._values.astype(object) return vals def _get_join_target(self) -> ArrayLike: From 7572eb53559ff0c558c3b2404edf3b8885af8cc1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 23 Jan 2023 20:31:41 -0500 Subject: [PATCH 19/20] Adjust asv --- asv_bench/benchmarks/indexing.py | 15 ++++++++++----- asv_bench/benchmarks/indexing_engines.py | 5 ++++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index b2fdcf5480279..519db1886d18b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -85,6 +85,11 @@ def time_loc_slice(self, index, index_structure): class NumericMaskedIndexing: + monotonic_list = list(range(10**6)) + non_monotonic_list = ( + list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1)) + ) + params = [ ("Int64", "UInt64", "Float64"), (True, False), @@ -92,12 +97,12 @@ class NumericMaskedIndexing: param_names = ["dtype", "monotonic"] def setup(self, dtype, monotonic): - N = 10**6 + indices = { - True: Index(range(N), dtype=dtype), - False: Index( - list(range(50)) + [54, 53, 52, 51] + list(range(55, N - 1)), dtype=dtype - ).append(Index([NA], dtype=dtype)), + True: Index(self.monotonic_list, dtype=dtype), + False: Index(self.non_monotonic_list, dtype=dtype).append( + Index([NA], dtype=dtype) + ), } self.data = indices[monotonic] self.indexer = np.arange(300, 1_000) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index ad15d821ba116..ce208761638c5 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,5 +1,8 @@ """ -Benchmarks in this file depend exclusively on code in _libs/ +Benchmarks in this file depend mostly on code in _libs/ + +We have to created masked arrays to test the masked engine though. The +array is unpacked on the Cython level. If a PR does not edit anything in _libs, it is very unlikely that benchmarks in this file will be affected. From aab7ed028ccdfb82985a9da577a03b7275525fd2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 19:35:34 -0500 Subject: [PATCH 20/20] Add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index dc05745c8c0e5..f8ac9645f758d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -884,6 +884,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)