Skip to content

ENH: Add masked engine #49420

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
6f5a8ea
Implement masked engine
phofl Oct 31, 2022
a051815
Add tests
phofl Oct 31, 2022
3513430
Fix asv
phofl Oct 31, 2022
2246123
Fix mypy
phofl Oct 31, 2022
1d35e2e
Add test
phofl Oct 31, 2022
797269a
Fix error
phofl Oct 31, 2022
32e6db4
Fix windows builds
phofl Nov 1, 2022
9380873
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Nov 1, 2022
36a0d29
Fix typing
phofl Nov 1, 2022
ddcdb13
Use np arrays
phofl Nov 2, 2022
e29a970
Adress review
phofl Nov 10, 2022
d258bc3
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Nov 16, 2022
6fac3b8
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Nov 22, 2022
aec65a7
Adapt to join difference
phofl Nov 22, 2022
f0e749a
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Nov 29, 2022
69113cb
Address review
phofl Nov 30, 2022
cadc239
Add todo
phofl Nov 30, 2022
7569039
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Dec 13, 2022
06fbe03
Move import
phofl Dec 13, 2022
e431337
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Dec 16, 2022
9326c14
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Dec 16, 2022
44ba473
Refactor
phofl Dec 16, 2022
844155f
Merge remote-tracking branch 'upstream/main' into enh_masked_engine
phofl Dec 19, 2022
1a6fec0
Remove unnecessary function
phofl Dec 19, 2022
befac73
Remove unnecessary function
phofl Dec 19, 2022
9945a07
Merge branch 'main' into enh_masked_engine
phofl Dec 20, 2022
5faa2fe
Merge branch 'main' into enh_masked_engine
phofl Dec 28, 2022
efde9dd
Combine conditions
phofl Jan 3, 2023
7711ac0
Merge branch 'main' into enh_masked_engine
phofl Jan 4, 2023
ca6e8dd
Merge branch 'main' into enh_masked_engine
phofl Jan 7, 2023
958b40e
Merge branch 'main' into enh_masked_engine
phofl Jan 14, 2023
7572eb5
Adjust asv
phofl Jan 24, 2023
f2b0a98
Merge remote-tracking branch 'origin/enh_masked_engine' into enh_mask…
phofl Jan 24, 2023
be6fb4d
Merge branch 'main' into enh_masked_engine
phofl Jan 24, 2023
aab7ed0
Add whatsnew
phofl Jan 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
import numpy as np

from pandas import (
NA,
CategoricalIndex,
DataFrame,
Float64Index,
Index,
Int64Index,
IntervalIndex,
MultiIndex,
Expand Down Expand Up @@ -87,6 +89,32 @@ def time_loc_slice(self, index, index_structure):
self.data.loc[:800000]


class NumericMaskedIndexing:
params = [
("Int64", "UInt64", "Float64"),
(True, False),
]
param_names = ["dtype", "monotonic"]

def setup(self, dtype, monotonic):
N = 10**6
indices = {
True: Index(range(N), dtype=dtype),
False: Index(
list(range(50)) + [54, 53, 52, 51] + list(range(55, N - 1)), dtype=dtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NBD but i think if you do this at the class level outside of setup it might only construct once. might add up

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we run into risks of caching stuff on the Index level that could impact performance?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its possible. could just create the arrays instead of the Index objects

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, moved the list construction outside. We should probably think about going over the asvs, we are doing this in a lot of places.

).append(Index([NA], dtype=dtype)),
}
self.data = indices[monotonic]
self.indexer = np.arange(300, 1_000)
self.data_dups = self.data.append(self.data)

def time_get_indexer(self, dtype, monotonic):
    """Time ``get_indexer`` on the index stored in ``self.data``.

    ``dtype`` and ``monotonic`` are the benchmark parameters; they are not
    read here, only the ``data`` and ``indexer`` attributes are.
    """
    index = self.data
    lookup = index.get_indexer
    lookup(self.indexer)

def time_get_indexer_dups(self, dtype, monotonic):
    """Time ``get_indexer_for`` on the index that contains duplicates.

    ``dtype`` and ``monotonic`` are the benchmark parameters; they are not
    read here, only the ``data_dups`` and ``indexer`` attributes are.
    """
    # Query data_dups (the index appended to itself): going through
    # self.data would only repeat the duplicate-free lookup under a
    # misleading benchmark name.
    self.data_dups.get_indexer_for(self.indexer)


class NonNumericSeriesIndexing:

params = [
Expand Down
75 changes: 75 additions & 0 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,26 @@ def _get_numeric_engines():
]


def _get_masked_engines():
    """Return ``(engine_class, dtype_name)`` pairs for the masked engines.

    Each engine class is looked up by name on ``libindex``; names that are
    not present there are filtered out instead of raising.
    """
    engine_names = [
        ("MaskedInt64Engine", "Int64"),
        ("MaskedInt32Engine", "Int32"),
        ("MaskedInt16Engine", "Int16"),
        ("MaskedInt8Engine", "Int8"),
        ("MaskedUInt64Engine", "UInt64"),
        ("MaskedUInt32Engine", "UInt32"),
        # The name must match the class exactly: the hasattr() filter below
        # silently drops anything misspelled (e.g. a lower-case "engine").
        ("MaskedUInt16Engine", "UInt16"),
        ("MaskedUInt8Engine", "UInt8"),
        ("MaskedFloat64Engine", "Float64"),
        ("MaskedFloat32Engine", "Float32"),
    ]
    return [
        (getattr(libindex, engine_name), dtype)
        for engine_name, dtype in engine_names
        if hasattr(libindex, engine_name)
    ]


class NumericEngineIndexing:

params = [
Expand Down Expand Up @@ -80,6 +100,61 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
self.data.get_loc(self.key_middle)


class MaskedNumericEngineIndexing:
    """Benchmark ``get_loc`` on the masked numeric index engines.

    Parametrized over the engine/dtype pair, the ordering of the values,
    whether the values are unique, and the size factor ``N`` (the engine
    always receives ``3 * N`` values).
    """

    params = [
        _get_masked_engines(),
        ["monotonic_incr", "monotonic_decr", "non_monotonic"],
        [True, False],
        [10**5, 2 * 10**6],  # 2e6 is above SIZE_CUTOFF
    ]
    param_names = ["engine_and_dtype", "index_type", "unique", "N"]

    def setup(self, engine_and_dtype, index_type, unique, N):
        """Build the engine for one parameter combination and warm it up."""
        engine_cls, masked_name = engine_and_dtype
        # The numpy dtype name is the lower-cased masked dtype name.
        np_dtype = masked_name.lower()
        size = N * 3

        if index_type in ("monotonic_incr", "monotonic_decr"):
            if unique:
                values = np.arange(size, dtype=np_dtype)
            else:
                # N ones, then N twos, then N threes.
                values = np.repeat(np.array([1, 2, 3], dtype=np_dtype), N)
            if index_type == "monotonic_decr":
                values = values[::-1]
            mask = np.zeros(size, dtype="uint8")
        else:
            assert index_type == "non_monotonic"
            if unique:
                # The top third of the range first, then the lower two thirds.
                values = np.zeros(size, dtype=np_dtype)
                values[:N] = np.arange(N * 2, size, dtype=np_dtype)
                values[N:] = np.arange(N * 2, dtype=np_dtype)
            else:
                # 1, 2, 3 repeated N times.
                values = np.tile(np.array([1, 2, 3], dtype=np_dtype), N)
            mask = np.zeros(size, dtype="uint8")
            # Only this layout sets a mask entry (the last position).
            mask[-1] = True

        self.data = engine_cls(values, mask)
        # Do one lookup up front so that populating the mapping etc. is not
        # part of what gets timed.
        self.data.get_loc(2)

        self.key_middle = values[len(values) // 2]
        self.key_early = values[2]

    def time_get_loc(self, engine_and_dtype, index_type, unique, N):
        """Time ``get_loc`` for the key taken from position 2."""
        engine = self.data
        engine.get_loc(self.key_early)

    def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
        """Time ``get_loc`` for the key taken from the middle position."""
        # searchsorted performance may be different near the middle of a
        # range vs near an endpoint
        engine = self.data
        engine.get_loc(self.key_middle)


class ObjectEngineIndexing:

params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,12 @@ class HashTable:
# Stub for HashTable.map_locations: takes the values array plus an optional
# ``mask`` array (default elided with ``...``) and returns nothing.
# NOTE(review): presumably ``mask`` flags missing entries in ``values`` and
# not every subclass honours it yet -- confirm against the Cython sources.
def map_locations(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
mask: np.ndarray | None = ...,
) -> None: ...
# Stub for HashTable.lookup: takes the values array plus an optional ``mask``
# array (default elided with ``...``) and returns an intp ndarray.
# NOTE(review): presumably ``mask`` flags missing entries in ``values`` and
# not every subclass honours it yet -- confirm against the Cython sources.
def lookup(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
mask: np.ndarray | None = ...,
) -> npt.NDArray[np.intp]: ...
def get_labels(
self,
Expand Down
12 changes: 8 additions & 4 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -966,8 +966,9 @@ cdef class StringHashTable(HashTable):
return labels

@cython.boundscheck(False)
def lookup(self, ndarray[object] values) -> ndarray:
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down Expand Up @@ -1002,7 +1003,8 @@ cdef class StringHashTable(HashTable):
return np.asarray(locs)

@cython.boundscheck(False)
def map_locations(self, ndarray[object] values) -> None:
def map_locations(self, ndarray[object] values, object mask = None) -> None:
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down Expand Up @@ -1275,7 +1277,8 @@ cdef class PyObjectHashTable(HashTable):
else:
raise KeyError(key)

def map_locations(self, ndarray[object] values) -> None:
def map_locations(self, ndarray[object] values, object mask = None) -> None:
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand All @@ -1289,8 +1292,9 @@ cdef class PyObjectHashTable(HashTable):
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i

def lookup(self, ndarray[object] values) -> ndarray:
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down
27 changes: 23 additions & 4 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ from pandas.core.arrays import ExtensionArray

class IndexEngine:
over_size_threshold: bool
def __init__(self, values: np.ndarray) -> None: ...
def __init__(self, values: np.ndarray, mask: np.ndarray | None = ...) -> None: ...
def __contains__(self, val: object) -> bool: ...

# -> int | slice | np.ndarray[bool]
Expand All @@ -23,12 +23,19 @@ class IndexEngine:
@property
def is_mapping_populated(self) -> bool: ...
def clear_mapping(self): ...
def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
def get_indexer(
self, values: np.ndarray, mask: np.ndarray | None = ...
) -> npt.NDArray[np.intp]: ...
def get_indexer_non_unique(
self,
targets: np.ndarray,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...

class MaskedIndexEngine(IndexEngine):
# Unlike IndexEngine.get_indexer_non_unique, which takes only ``targets``,
# this override also requires a ``target_mask`` array. The signatures are
# therefore incompatible, hence the ``type: ignore[override]``.
def get_indexer_non_unique( # type: ignore[override]
self, targets: np.ndarray, target_mask: np.ndarray
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...

class Float64Engine(IndexEngine): ...
class Float32Engine(IndexEngine): ...
class Complex128Engine(IndexEngine): ...
Expand All @@ -46,6 +53,19 @@ class DatetimeEngine(Int64Engine): ...
class TimedeltaEngine(DatetimeEngine): ...
class PeriodEngine(Int64Engine): ...
class BoolEngine(UInt8Engine): ...
# Masked engine stubs. All derive from MaskedIndexEngine except
# MaskedBoolEngine, which derives from MaskedUInt8Engine; that class is
# declared at the end of this group, and the forward reference is acceptable
# in a stub file.
class MaskedBoolEngine(MaskedUInt8Engine): ...
class MaskedFloat64Engine(MaskedIndexEngine): ...
class MaskedFloat32Engine(MaskedIndexEngine): ...
class MaskedComplex128Engine(MaskedIndexEngine): ...
class MaskedComplex64Engine(MaskedIndexEngine): ...
class MaskedInt64Engine(MaskedIndexEngine): ...
class MaskedInt32Engine(MaskedIndexEngine): ...
class MaskedInt16Engine(MaskedIndexEngine): ...
class MaskedInt8Engine(MaskedIndexEngine): ...
class MaskedUInt64Engine(MaskedIndexEngine): ...
class MaskedUInt32Engine(MaskedIndexEngine): ...
class MaskedUInt16Engine(MaskedIndexEngine): ...
class MaskedUInt8Engine(MaskedIndexEngine): ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
Expand All @@ -58,8 +78,7 @@ class BaseMultiIndexCodesEngine:
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
) -> None: ...
def get_indexer(
self,
target: npt.NDArray[np.object_],
self, target: npt.NDArray[np.object_], mask: np.ndarray | None = ...
) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
def get_indexer_with_fill(
Expand Down
Loading