Skip to content

Commit d4d2e19

Browse files
authored
PERF: Always use pandas' hashtable approach, dropping np.in1d (#36611)
1 parent 4f738a4 commit d4d2e19

File tree

3 files changed

+55
-4
lines changed

3 files changed

+55
-4
lines changed

asv_bench/benchmarks/series_methods.py

+49
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,55 @@ def time_isin_long_series_long_values_floats(self):
9090
self.s_long_floats.isin(self.vals_long_floats)
9191

9292

93+
class IsInLongSeriesLookUpDominates:
94+
params = [
95+
["int64", "int32", "float64", "float32", "object"],
96+
[5, 1000],
97+
["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
98+
]
99+
param_names = ["dtype", "MaxNumber", "series_type"]
100+
101+
def setup(self, dtype, MaxNumber, series_type):
102+
N = 10 ** 7
103+
if series_type == "random_hits":
104+
np.random.seed(42)
105+
array = np.random.randint(0, MaxNumber, N)
106+
if series_type == "random_misses":
107+
np.random.seed(42)
108+
array = np.random.randint(0, MaxNumber, N) + MaxNumber
109+
if series_type == "monotone_hits":
110+
array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
111+
if series_type == "monotone_misses":
112+
array = np.arange(N) + MaxNumber
113+
self.series = Series(array).astype(dtype)
114+
self.values = np.arange(MaxNumber).astype(dtype)
115+
116+
def time_isin(self, dtypes, MaxNumber, series_type):
117+
self.series.isin(self.values)
118+
119+
120+
class IsInLongSeriesValuesDominate:
121+
params = [
122+
["int64", "int32", "float64", "float32", "object"],
123+
["random", "monotone"],
124+
]
125+
param_names = ["dtype", "series_type"]
126+
127+
def setup(self, dtype, series_type):
128+
N = 10 ** 7
129+
if series_type == "random":
130+
np.random.seed(42)
131+
vals = np.random.randint(0, 10 * N, N)
132+
if series_type == "monotone":
133+
vals = np.arange(N)
134+
self.values = vals.astype(dtype)
135+
M = 10 ** 6 + 1
136+
self.series = Series(np.arange(M)).astype(dtype)
137+
138+
def time_isin(self, dtypes, series_type):
139+
self.series.isin(self.values)
140+
141+
93142
class NSort:
94143

95144
params = ["first", "last", "all"]

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,7 @@ Performance improvements
482482
- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
483483
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
484484
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
485+
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
485486

486487
.. ---------------------------------------------------------------------------
487488

pandas/core/algorithms.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -433,19 +433,20 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
433433
comps, dtype = _ensure_data(comps)
434434
values, _ = _ensure_data(values, dtype=dtype)
435435

436-
# faster for larger cases to use np.in1d
437436
f = htable.ismember_object
438437

439438
# GH16012
440439
# Ensure np.in1d doesn't get object types or it *may* throw an exception
441-
if len(comps) > 1_000_000 and not is_object_dtype(comps):
440+
# Although a hash map has O(1) look-up (vs. O(log n) in a sorted array),
441+
# np.in1d is faster when `values` is small
442+
if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps):
442443
# If the values include nan we need to check for nan explicitly
443444
# since np.nan is not equal to np.nan
444445
if isna(values).any():
445446
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
446447
else:
447448
f = np.in1d
448-
elif is_integer_dtype(comps):
449+
elif is_integer_dtype(comps.dtype):
449450
try:
450451
values = values.astype("int64", copy=False)
451452
comps = comps.astype("int64", copy=False)
@@ -454,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
454455
values = values.astype(object)
455456
comps = comps.astype(object)
456457

457-
elif is_float_dtype(comps):
458+
elif is_float_dtype(comps.dtype):
458459
try:
459460
values = values.astype("float64", copy=False)
460461
comps = comps.astype("float64", copy=False)

0 commit comments

Comments
 (0)