diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index fa373905ef08a..0608af8f8504b 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -11,14 +11,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 {{py:
 
 # dtype, ttype
-dtypes = [('float64', 'float64'),
-          ('uint64', 'uint64'),
-          ('object', 'pymap'),
-          ('int64', 'int64')]
+dtypes = [('float64', 'float64', 'float64_t'),
+          ('uint64', 'uint64', 'uint64_t'),
+          ('object', 'pymap', 'object'),
+          ('int64', 'int64', 'int64_t')]
 
 }}
 
-{{for dtype, ttype in dtypes}}
+{{for dtype, ttype, scalar in dtypes}}
 
 
 @cython.wraparound(False)
@@ -34,9 +34,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
         khiter_t k
         Py_ssize_t i, n = len(values)
 
-        {{if dtype != 'object'}}
-        {{dtype}}_t val
-        {{endif}}
+        {{scalar}} val
 
         int ret = 0
 
@@ -79,7 +77,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
 {{if dtype == 'object'}}
 cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
 {{else}}
-cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
+cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i=0
@@ -130,12 +128,11 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-
 def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
 {{else}}
-def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
+def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
 {{endif}}
     cdef:
         int ret = 0
@@ -203,8 +200,87 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
     kh_destroy_{{ttype}}(table)
     return out
 
+
+#----------------------------------------------------------------------
+# Membership
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if dtype == 'object'}}
+
+def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
+{{else}}
+
+def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
+{{endif}}
+
+    """
+    Return a boolean array indicating, element by element,
+    whether each value of arr is contained in values.
+
+    Parameters
+    ----------
+    arr : {{dtype}} ndarray
+    values : {{dtype}} ndarray
+    hasnans : bint, optional
+
+    Returns
+    -------
+    boolean ndarray of len(arr)
+    """
+    cdef:
+        Py_ssize_t i, n, k
+        int ret = 0
+        ndarray[uint8_t] result
+        {{scalar}} val
+        kh_{{ttype}}_t * table = kh_init_{{ttype}}()
+
+
+    # construct the table
+    n = len(values)
+    kh_resize_{{ttype}}(table, min(n, len(values)))
+
+    {{if dtype == 'object'}}
+    for i in range(n):
+        kh_put_{{ttype}}(table, values[i], &ret)
+    {{else}}
+    with nogil:
+        for i in range(n):
+            kh_put_{{ttype}}(table, values[i], &ret)
+    {{endif}}
+
+    # test membership
+    n = len(arr)
+    result = np.empty(n, dtype=np.uint8)
+
+    {{if dtype == 'object'}}
+    for i in range(n):
+        val = arr[i]
+        k = kh_get_{{ttype}}(table, val)
+        if k != table.n_buckets:
+            result[i] = 1
+        else:
+            result[i] = hasnans and val != val
+    {{else}}
+    with nogil:
+        for i in range(n):
+            val = arr[i]
+            k = kh_get_{{ttype}}(table, val)
+            if k != table.n_buckets:
+                result[i] = 1
+            else:
+                result[i] = hasnans and val != val
+    {{endif}}
+
+    kh_destroy_{{ttype}}(table)
+    return result.view(np.bool_)
+
 {{endfor}}
 
+
 #----------------------------------------------------------------------
 # Mode Computations
 #----------------------------------------------------------------------
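For readers skimming the template above: each generated ismember_<dtype> builds a khash table from `values` and then probes it once per element of `arr`, using the `val != val` trick as the NaN check when the caller signals that `values` contains NaN. The plain-Python sketch below is only illustrative; the function name and the use of a Python set in place of the khash table are not part of the patch.

import numpy as np

def ismember_sketch(arr, values, hasnans=False):
    # stand-in for the khash table built from `values`
    table = set(values)
    result = np.empty(len(arr), dtype=np.uint8)
    for i, val in enumerate(arr):
        if val in table:
            result[i] = 1
        else:
            # NaN never compares equal to itself, so a missed lookup is
            # still reported as a hit when the caller says `values` has NaN
            result[i] = hasnans and val != val
    return result.view(np.bool_)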
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index b4724bc3dd59b..f78040e5a52f2 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -13,6 +13,7 @@ cdef extern from "numpy/arrayobject.h":
     cdef enum NPY_TYPES:
         NPY_intp "NPY_INTP"
 
+from libc.stdlib cimport malloc, free
 from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyDict_Contains, PyDict_Keys,
 
@@ -111,77 +112,6 @@ cpdef map_indices_list(list index):
     return result
 
 
-from libc.stdlib cimport malloc, free
-
-
-def ismember_nans(float64_t[:] arr, set values, bint hasnans):
-    cdef:
-        Py_ssize_t i, n
-        ndarray[uint8_t] result
-        float64_t val
-
-    n = len(arr)
-    result = np.empty(n, dtype=np.uint8)
-    for i in range(n):
-        val = arr[i]
-        result[i] = val in values or hasnans and isnan(val)
-
-    return result.view(np.bool_)
-
-
-def ismember(ndarray arr, set values):
-    """
-    Checks whether
-
-    Parameters
-    ----------
-    arr : ndarray
-    values : set
-
-    Returns
-    -------
-    ismember : ndarray (boolean dtype)
-    """
-    cdef:
-        Py_ssize_t i, n
-        ndarray[uint8_t] result
-        object val
-
-    n = len(arr)
-    result = np.empty(n, dtype=np.uint8)
-    for i in range(n):
-        val = util.get_value_at(arr, i)
-        result[i] = val in values
-
-    return result.view(np.bool_)
-
-
-def ismember_int64(ndarray[int64_t] arr, set values):
-    """
-    Checks whether
-
-    Parameters
-    ----------
-    arr : ndarray of int64
-    values : set
-
-    Returns
-    -------
-    ismember : ndarray (boolean dtype)
-    """
-    cdef:
-        Py_ssize_t i, n
-        ndarray[uint8_t] result
-        int64_t v
-
-    n = len(arr)
-    result = np.empty(n, dtype=np.uint8)
-    for i in range(n):
-        result[i] = arr[i] in values
-
-    return result.view(np.bool_)
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def memory_usage_of_objects(ndarray[object, ndim=1] arr):
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 00a3264e6c74a..9a8d0a779105e 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -12,12 +12,12 @@
 from pandas.types.common import (is_unsigned_integer_dtype,
                                  is_signed_integer_dtype,
                                  is_integer_dtype,
-                                 is_int64_dtype,
                                  is_categorical_dtype,
                                  is_extension_type,
                                  is_datetimetz,
                                  is_period_dtype,
                                  is_period_arraylike,
+                                 is_numeric_dtype,
                                  is_float_dtype,
                                  is_bool_dtype,
                                  needs_i8_conversion,
@@ -197,19 +197,37 @@ def isin(comps, values):
         except TypeError:
             # object array conversion will fail
             pass
-    else:
+    elif is_numeric_dtype(comps):
         comps = np.asarray(comps)
         values = np.asarray(values)
+    else:
+        comps = np.asarray(comps).astype(object)
+        values = np.asarray(values).astype(object)
 
     # GH11232
     # work-around for numpy < 1.8 and comparisions on py3
     # faster for larger cases to use np.in1d
+    f = lambda x, y: htable.ismember_object(x, values)
     if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
-        f = lambda x, y: np.in1d(x, np.asarray(list(y)))
-    elif is_int64_dtype(comps):
-        f = lambda x, y: lib.ismember_int64(x, set(y))
-    else:
-        f = lambda x, y: lib.ismember(x, set(values))
+        f = lambda x, y: np.in1d(x, y)
+    elif is_integer_dtype(comps):
+        try:
+            values = values.astype('int64', copy=False)
+            comps = comps.astype('int64', copy=False)
+            f = lambda x, y: htable.ismember_int64(x, y)
+        except (TypeError, ValueError):
+            values = values.astype(object)
+            comps = comps.astype(object)
+
+    elif is_float_dtype(comps):
+        try:
+            values = values.astype('float64', copy=False)
+            comps = comps.astype('float64', copy=False)
+            checknull = isnull(values).any()
+            f = lambda x, y: htable.ismember_float64(x, y, checknull)
+        except (TypeError, ValueError):
+            values = values.astype(object)
+            comps = comps.astype(object)
 
     return f(comps, values)
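As a rough usage sketch of the reworked dispatch in pandas/core/algorithms.py, assuming a pandas build that includes this patch: integer input is routed to htable.ismember_int64, float input to htable.ismember_float64 with a precomputed null check, and everything else falls back to the object path.

import numpy as np
from pandas.core import algorithms

# integer comps take the int64 hashtable path
algorithms.isin(np.array([1, 2, 3]), [2, 4])
# -> array([False,  True, False])

# float comps take the float64 path; NaN matches only because
# checknull (isnull(values).any()) is forwarded as hasnans
algorithms.isin(np.array([1.0, np.nan]), [np.nan])
# -> array([False,  True])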
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 732d88b47ae2a..b49aa926d1923 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5358,8 +5358,8 @@ def isin(self, values):
                                 "you passed a "
                                 "{0!r}".format(type(values).__name__))
             return DataFrame(
-                lib.ismember(self.values.ravel(),
-                             set(values)).reshape(self.shape), self.index,
+                algorithms.isin(self.values.ravel(),
+                                values).reshape(self.shape), self.index,
                 self.columns)
 
     # ----------------------------------------------------------------------
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
index 978492131ca89..e6ae0605d4758 100644
--- a/pandas/indexes/multi.py
+++ b/pandas/indexes/multi.py
@@ -1392,7 +1392,7 @@ def _drop_from_level(self, labels, level):
         index = self.levels[i]
         values = index.get_indexer(labels)
 
-        mask = ~lib.ismember(self.labels[i], set(values))
+        mask = ~algos.isin(self.labels[i], values)
 
         return self[mask]
 
@@ -2463,7 +2463,8 @@ def _wrap_joined_index(self, joined, other):
     @Appender(Index.isin.__doc__)
     def isin(self, values, level=None):
         if level is None:
-            return lib.ismember(np.array(self), set(values))
+            return algos.isin(self.values,
+                              MultiIndex.from_tuples(values).values)
         else:
             num = self._get_level_number(level)
             levs = self.levels[num]
diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py
index 2f897c81975c2..31258c785d9e8 100644
--- a/pandas/indexes/numeric.py
+++ b/pandas/indexes/numeric.py
@@ -1,13 +1,13 @@
 import numpy as np
-from pandas._libs import (lib, index as libindex,
+from pandas._libs import (index as libindex,
                           algos as libalgos, join as libjoin)
 from pandas.types.common import (is_dtype_equal, pandas_dtype,
                                  is_float_dtype, is_object_dtype,
                                  is_integer_dtype, is_scalar)
-from pandas.types.missing import isnull
 from pandas.core.common import _asarray_tuplesafe, _values_from_object
 from pandas import compat
+from pandas.core import algorithms
 from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
 from pandas.util.decorators import Appender, cache_readonly
 import pandas.indexes.base as ibase
@@ -379,11 +379,9 @@ def is_unique(self):
 
     @Appender(Index.isin.__doc__)
     def isin(self, values, level=None):
-        value_set = set(values)
         if level is not None:
             self._validate_index_level(level)
-        return lib.ismember_nans(np.array(self), value_set,
-                                 isnull(list(value_set)).any())
+        return algorithms.isin(np.array(self), values)
 
 
 Float64Index._add_numeric_methods()
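The pandas/indexes/multi.py hunk above now materializes `values` with MultiIndex.from_tuples before testing membership, so the comparison runs on object arrays of tuples via algorithms.isin. A small illustrative example with made-up data, again assuming this patch is applied:

import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 2], ['a', 'b']])
mi.isin([(1, 'a'), (3, 'c')])
# -> array([ True, False])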
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 18343670fb39e..90d72c0bceeb7 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -26,6 +26,7 @@
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.categorical import Categorical
+from pandas.core import algorithms
 from pandas.core.common import AbstractMethodError
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
@@ -1388,7 +1389,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                 try:
                     values = lib.map_infer(values, conv_f)
                 except ValueError:
-                    mask = lib.ismember(values, na_values).view(np.uint8)
+                    mask = algorithms.isin(
+                        values, list(na_values)).view(np.uint8)
                     values = lib.map_infer_mask(values, conv_f, mask)
 
                 cvals, na_count = self._infer_types(
@@ -1436,7 +1438,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         na_count = 0
 
         if issubclass(values.dtype.type, (np.number, np.bool_)):
-            mask = lib.ismember(values, na_values)
+            mask = algorithms.isin(values, list(na_values))
             na_count = mask.sum()
             if na_count > 0:
                 if is_integer_dtype(values):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 7199a38bb7a80..c4dc10d8174cc 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -1363,14 +1363,17 @@ def test_isin_nan(self):
                                     np.array([False, False]))
         tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
                                     np.array([False, False]))
+
+        # Float64Index overrides isin, so must be checked separately
         tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([np.nan]),
                                     np.array([False, True]))
         tm.assert_numpy_array_equal(
             Float64Index([1.0, np.nan]).isin([float('nan')]),
             np.array([False, True]))
+
+        # we cannot compare NaT with NaN
         tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([pd.NaT]),
-                                    np.array([False, True]))
+                                    np.array([False, False]))
 
     def test_isin_level_kwarg(self):
         def check_idx(idx):
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index 093331e861fa7..5dc9746c6d6f9 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -13,6 +13,7 @@
 from pandas.types.generic import (ABCIndexClass, ABCSeries,
                                   ABCDataFrame)
 from pandas.types.missing import notnull
+from pandas.core import algorithms
 import pandas.compat as compat
 
@@ -577,7 +578,7 @@ def calc_with_mask(carg, mask):
 
         # string with NaN-like
         try:
-            mask = ~lib.ismember(arg, tslib._nat_strings)
+            mask = ~algorithms.isin(arg, list(tslib._nat_strings))
             return calc_with_mask(arg, mask)
         except:
             pass
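The updated Float64Index tests above pin down the user-visible effect of the new float path: NaN-like floats still match, while pd.NaT no longer counts as NaN. A small reproduction, assuming a build that includes this patch:

import numpy as np
import pandas as pd

idx = pd.Float64Index([1.0, np.nan])
idx.isin([np.nan])        # -> array([False,  True])
idx.isin([float('nan')])  # -> array([False,  True])
idx.isin([pd.NaT])        # -> array([False, False])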