Skip to content

PERF: Always using pandas' hashtable approach, dropping np.in1d #36611

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Nov 17, 2020
Merged
49 changes: 49 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,55 @@ def time_isin_long_series_long_values_floats(self):
self.s_long_floats.isin(self.vals_long_floats)


class IsInLongSeriesLookUpDominates:
    """
    Benchmark ``Series.isin`` for a long series probed against few values.

    The series holds ``10 ** 7`` elements while the values being looked up
    are only ``np.arange(MaxNumber)``, so the time is spent on the
    per-element look-ups rather than on preparing the values.

    Parameters (supplied by the benchmark runner, in this order):

    dtype : str
        dtype both the series and the values are cast to.
    MaxNumber : int
        Number of distinct values to look up.
    series_type : str
        How the series is filled:

        * ``"random_hits"``: random integers in ``[0, MaxNumber)``, so every
          element is present in the values.
        * ``"random_misses"``: the same random integers shifted up by
          ``MaxNumber``, so no element is present in the values.
        * ``"monotone_hits"``: each of ``0 .. MaxNumber - 1`` repeated
          ``N // MaxNumber`` times, in non-decreasing order.
        * ``"monotone_misses"``: ``N`` consecutive integers starting at
          ``MaxNumber``, so no element is present in the values.
    """

    params = [
        ["int64", "int32", "float64", "float32", "object"],
        [5, 1000],
        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
    ]
    param_names = ["dtype", "MaxNumber", "series_type"]

    def setup(self, dtype, MaxNumber, series_type):
        """
        Build ``self.series`` and ``self.values`` for one parameter combination.

        Raises
        ------
        ValueError
            If ``series_type`` is not one of the supported kinds.
        """
        N = 10 ** 7
        # The cases are mutually exclusive; an unknown kind is rejected here,
        # before any large array is allocated, instead of surfacing later as
        # an unbound local variable.
        if series_type == "random_hits":
            # fixed seed keeps the data identical between benchmark runs
            np.random.seed(42)
            array = np.random.randint(0, MaxNumber, N)
        elif series_type == "random_misses":
            np.random.seed(42)
            array = np.random.randint(0, MaxNumber, N) + MaxNumber
        elif series_type == "monotone_hits":
            array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
        elif series_type == "monotone_misses":
            array = np.arange(N) + MaxNumber
        else:
            raise ValueError(f"unknown series_type: {series_type!r}")

        self.series = Series(array).astype(dtype)
        self.values = np.arange(MaxNumber).astype(dtype)

    def time_isin(self, dtypes, MaxNumber, series_type):
        """Time the membership test; the arguments are unused here."""
        self.series.isin(self.values)


class IsInLongSeriesValuesDominate:
    """
    Benchmark ``Series.isin`` when the values outnumber the series.

    The values hold ``10 ** 7`` elements while the series holds only
    ``10 ** 6 + 1``, so the values are ten times more numerous than the
    elements being tested.

    Parameters (supplied by the benchmark runner, in this order):

    dtype : str
        dtype both the series and the values are cast to.
    series_type : str
        How the values are filled:

        * ``"random"``: random integers in ``[0, 10 * N)``.
        * ``"monotone"``: the consecutive integers ``0 .. N - 1``.
    """

    params = [
        ["int64", "int32", "float64", "float32", "object"],
        ["random", "monotone"],
    ]
    param_names = ["dtype", "series_type"]

    def setup(self, dtype, series_type):
        """
        Build ``self.series`` and ``self.values`` for one parameter combination.

        Raises
        ------
        ValueError
            If ``series_type`` is not one of the supported kinds.
        """
        N = 10 ** 7
        # The cases are mutually exclusive; an unknown kind is rejected here,
        # before any large array is allocated, instead of surfacing later as
        # an unbound local variable.
        if series_type == "random":
            # fixed seed keeps the data identical between benchmark runs
            np.random.seed(42)
            vals = np.random.randint(0, 10 * N, N)
        elif series_type == "monotone":
            vals = np.arange(N)
        else:
            raise ValueError(f"unknown series_type: {series_type!r}")

        self.values = vals.astype(dtype)
        # NOTE(review): one more than 10 ** 6 -- presumably chosen to sit just
        # above a 1_000_000-element size threshold inside ``isin``; confirm.
        M = 10 ** 6 + 1
        self.series = Series(np.arange(M)).astype(dtype)

    def time_isin(self, dtypes, series_type):
        """Time the membership test; the arguments are unused here."""
        self.series.isin(self.values)


class NSort:

params = ["first", "last", "all"]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ Performance improvements
- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements

.. ---------------------------------------------------------------------------

Expand Down
9 changes: 5 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,19 +433,20 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
comps, dtype = _ensure_data(comps)
values, _ = _ensure_data(values, dtype=dtype)

# faster for larger cases to use np.in1d
f = htable.ismember_object

# GH16012
# Ensure np.in1d doesn't get object types or it *may* throw an exception
if len(comps) > 1_000_000 and not is_object_dtype(comps):
# Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
# in1d is faster for small sizes
if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps):
# If the values include nan we need to check for nan explicitly
# since np.nan is not equal to np.nan
if isna(values).any():
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
else:
f = np.in1d
elif is_integer_dtype(comps):
elif is_integer_dtype(comps.dtype):
try:
values = values.astype("int64", copy=False)
comps = comps.astype("int64", copy=False)
Expand All @@ -454,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
values = values.astype(object)
comps = comps.astype(object)

elif is_float_dtype(comps):
elif is_float_dtype(comps.dtype):
try:
values = values.astype("float64", copy=False)
comps = comps.astype("float64", copy=False)
Expand Down