CLN: relocate lib.ismember* to hashtable space

jreback · jreback · commit 32dd92912f15 · 2017-03-21T19:19:32.000-04:00
- fixes .isin on 32-bit (hopefully)
- perf about 30% better
- releases GIL

Author: Jeff Reback <jeff@reback.net>

Closes pandas-dev#15773 from jreback/ismember and squashes the following commits:

a7dfe51 [Jeff Reback] CLN: relocate lib.ismember* to hashtable space
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -11,14 +11,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 {{py:
 
 # dtype, ttype
-dtypes = [('float64', 'float64'),
-          ('uint64', 'uint64'),
-          ('object', 'pymap'),
-          ('int64', 'int64')]
+dtypes = [('float64', 'float64', 'float64_t'),
+          ('uint64', 'uint64', 'uint64_t'),
+          ('object', 'pymap', 'object'),
+          ('int64', 'int64', 'int64_t')]
 
 }}
 
-{{for dtype, ttype in dtypes}}
+{{for dtype, ttype, scalar in dtypes}}
 
 
 @cython.wraparound(False)
@@ -34,9 +34,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
         khiter_t k
         Py_ssize_t i, n = len(values)
 
-        {{if dtype != 'object'}}
-        {{dtype}}_t val
-        {{endif}}
+        {{scalar}} val
 
         int ret = 0
 
@@ -79,7 +77,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
 {{if dtype == 'object'}}
 cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
 {{else}}
-cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
+cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i=0
@@ -130,12 +128,11 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
 
-
 def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
 {{else}}
 
 
-def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
+def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
 {{endif}}
     cdef:
         int ret = 0
@@ -203,8 +200,87 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
     kh_destroy_{{ttype}}(table)
     return out
 
+
+#----------------------------------------------------------------------
+# Membership
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if dtype == 'object'}}
+
+def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
+{{else}}
+
+def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
+{{endif}}
+
+    """
+    Check, element by element, whether each entry of arr is in values.
+
+    Parameters
+    ----------
+    arr : {{dtype}} ndarray
+    values : {{dtype}} ndarray
+    hasnans : bint, optional
+        If true, entries of arr with val != val (NaN) count as members.
+
+    Returns
+    -------
+    boolean ndarray with len(arr) elements
+    """
+    cdef:
+        Py_ssize_t i, n, k
+        int ret = 0
+        ndarray[uint8_t] result
+        {{scalar}} val
+        kh_{{ttype}}_t * table = kh_init_{{ttype}}()
+
+
+    # insert every element of values into the hash table
+    n = len(values)
+    kh_resize_{{ttype}}(table, min(n, len(values)))
+
+    {{if dtype == 'object'}}
+    for i in range(n):
+        kh_put_{{ttype}}(table, <PyObject*> values[i], &ret)
+    {{else}}
+    with nogil:
+        for i in range(n):
+            kh_put_{{ttype}}(table, values[i], &ret)
+    {{endif}}
+
+    # NOTE(review): min() above is a no-op (n == len(values)); now probe arr
+    n = len(arr)
+    result = np.empty(n, dtype=np.uint8)
+
+    {{if dtype == 'object'}}
+    for i in range(n):
+        val = arr[i]
+        k = kh_get_{{ttype}}(table, <PyObject*> val)
+        if k != table.n_buckets:
+            result[i] = 1
+        else:
+            result[i] = hasnans and val != val
+    {{else}}
+    with nogil:
+        for i in range(n):
+            val = arr[i]
+            k = kh_get_{{ttype}}(table, val)
+            if k != table.n_buckets:
+                result[i] = 1
+            else:
+                result[i] = hasnans and val != val
+    {{endif}}
+
+    kh_destroy_{{ttype}}(table)
+    return result.view(np.bool_)
+
 {{endfor}}
 
+
+
 #----------------------------------------------------------------------
 # Mode Computations
 #----------------------------------------------------------------------
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -13,6 +13,7 @@ cdef extern from "numpy/arrayobject.h":
     cdef enum NPY_TYPES:
         NPY_intp "NPY_INTP"
 
+from libc.stdlib cimport malloc, free
 
 from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyDict_Contains, PyDict_Keys,
@@ -111,77 +112,6 @@ cpdef map_indices_list(list index):
     return result
 
 
-from libc.stdlib cimport malloc, free
-
-
-def ismember_nans(float64_t[:] arr, set values, bint hasnans):
-    cdef:
-        Py_ssize_t i, n
-        ndarray[uint8_t] result
-        float64_t val
-
-    n = len(arr)
-    result = np.empty(n, dtype=np.uint8)
-    for i in range(n):
-        val = arr[i]
-        result[i] = val in values or hasnans and isnan(val)
-
-    return result.view(np.bool_)
-
-
-def ismember(ndarray arr, set values):
-    """
-    Checks whether
-
-    Parameters
-    ----------
-    arr : ndarray
-    values : set
-
-    Returns
-    -------
-    ismember : ndarray (boolean dtype)
-    """
-    cdef:
-        Py_ssize_t i, n
-        ndarray[uint8_t] result
-        object val
-
-    n = len(arr)
-    result = np.empty(n, dtype=np.uint8)
-    for i in range(n):
-        val = util.get_value_at(arr, i)
-        result[i] = val in values
-
-    return result.view(np.bool_)
-
-
-def ismember_int64(ndarray[int64_t] arr, set values):
-    """
-    Checks whether
-
-    Parameters
-    ----------
-    arr : ndarray of int64
-    values : set
-
-    Returns
-    -------
-    ismember : ndarray (boolean dtype)
-    """
-    cdef:
-        Py_ssize_t i, n
-        ndarray[uint8_t] result
-        int64_t v
-
-    n = len(arr)
-    result = np.empty(n, dtype=np.uint8)
-    for i in range(n):
-        result[i] = arr[i] in values
-
-    return result.view(np.bool_)
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def memory_usage_of_objects(ndarray[object, ndim=1] arr):
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -12,12 +12,12 @@
 from pandas.types.common import (is_unsigned_integer_dtype,
                                  is_signed_integer_dtype,
                                  is_integer_dtype,
-                                 is_int64_dtype,
                                  is_categorical_dtype,
                                  is_extension_type,
                                  is_datetimetz,
                                  is_period_dtype,
                                  is_period_arraylike,
+                                 is_numeric_dtype,
                                  is_float_dtype,
                                  is_bool_dtype,
                                  needs_i8_conversion,
@@ -197,19 +197,37 @@ def isin(comps, values):
         except TypeError:
             # object array conversion will fail
             pass
-    else:
+    elif is_numeric_dtype(comps):
         comps = np.asarray(comps)
         values = np.asarray(values)
+    else:
+        comps = np.asarray(comps).astype(object)
+        values = np.asarray(values).astype(object)
 
     # GH11232
     # work-around for numpy < 1.8 and comparisions on py3
     # faster for larger cases to use np.in1d
+    f = lambda x, y: htable.ismember_object(x, values)
     if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
-        f = lambda x, y: np.in1d(x, np.asarray(list(y)))
-    elif is_int64_dtype(comps):
-        f = lambda x, y: lib.ismember_int64(x, set(y))
-    else:
-        f = lambda x, y: lib.ismember(x, set(values))
+        f = lambda x, y: np.in1d(x, y)
+    elif is_integer_dtype(comps):
+        try:
+            values = values.astype('int64', copy=False)
+            comps = comps.astype('int64', copy=False)
+            f = lambda x, y: htable.ismember_int64(x, y)
+        except (TypeError, ValueError):
+            values = values.astype(object)
+            comps = comps.astype(object)
+
+    elif is_float_dtype(comps):
+        try:
+            values = values.astype('float64', copy=False)
+            comps = comps.astype('float64', copy=False)
+            checknull = isnull(values).any()
+            f = lambda x, y: htable.ismember_float64(x, y, checknull)
+        except (TypeError, ValueError):
+            values = values.astype(object)
+            comps = comps.astype(object)
 
     return f(comps, values)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5358,8 +5358,8 @@ def isin(self, values):
                                 "you passed a "
                                 "{0!r}".format(type(values).__name__))
             return DataFrame(
-                lib.ismember(self.values.ravel(),
-                             set(values)).reshape(self.shape), self.index,
+                algorithms.isin(self.values.ravel(),
+                                values).reshape(self.shape), self.index,
                 self.columns)
 
     # ----------------------------------------------------------------------
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
@@ -1392,7 +1392,7 @@ def _drop_from_level(self, labels, level):
         index = self.levels[i]
         values = index.get_indexer(labels)
 
-        mask = ~lib.ismember(self.labels[i], set(values))
+        mask = ~algos.isin(self.labels[i], values)
 
         return self[mask]
 
@@ -2463,7 +2463,8 @@ def _wrap_joined_index(self, joined, other):
     @Appender(Index.isin.__doc__)
     def isin(self, values, level=None):
         if level is None:
-            return lib.ismember(np.array(self), set(values))
+            return algos.isin(self.values,
+                              MultiIndex.from_tuples(values).values)
         else:
             num = self._get_level_number(level)
             levs = self.levels[num]
diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py
@@ -1,13 +1,13 @@
 import numpy as np
-from pandas._libs import (lib, index as libindex,
+from pandas._libs import (index as libindex,
                           algos as libalgos, join as libjoin)
 from pandas.types.common import (is_dtype_equal, pandas_dtype,
                                  is_float_dtype, is_object_dtype,
                                  is_integer_dtype, is_scalar)
-from pandas.types.missing import isnull
 from pandas.core.common import _asarray_tuplesafe, _values_from_object
 
 from pandas import compat
+from pandas.core import algorithms
 from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
 from pandas.util.decorators import Appender, cache_readonly
 import pandas.indexes.base as ibase
@@ -379,11 +379,9 @@ def is_unique(self):
 
     @Appender(Index.isin.__doc__)
     def isin(self, values, level=None):
-        value_set = set(values)
         if level is not None:
             self._validate_index_level(level)
-        return lib.ismember_nans(np.array(self), value_set,
-                                 isnull(list(value_set)).any())
+        return algorithms.isin(np.array(self), values)
 
 
 Float64Index._add_numeric_methods()
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -26,6 +26,7 @@
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.categorical import Categorical
+from pandas.core import algorithms
 from pandas.core.common import AbstractMethodError
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
@@ -1388,7 +1389,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                 try:
                     values = lib.map_infer(values, conv_f)
                 except ValueError:
-                    mask = lib.ismember(values, na_values).view(np.uint8)
+                    mask = algorithms.isin(
+                        values, list(na_values)).view(np.uint8)
                     values = lib.map_infer_mask(values, conv_f, mask)
 
                 cvals, na_count = self._infer_types(
@@ -1436,7 +1438,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
 
         na_count = 0
         if issubclass(values.dtype.type, (np.number, np.bool_)):
-            mask = lib.ismember(values, na_values)
+            mask = algorithms.isin(values, list(na_values))
             na_count = mask.sum()
             if na_count > 0:
                 if is_integer_dtype(values):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -1363,14 +1363,17 @@ def test_isin_nan(self):
                                     np.array([False, False]))
         tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
                                     np.array([False, False]))
+
         # Float64Index overrides isin, so must be checked separately
         tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([np.nan]),
                                     np.array([False, True]))
         tm.assert_numpy_array_equal(
             Float64Index([1.0, np.nan]).isin([float('nan')]),
             np.array([False, True]))
+
+        # we cannot compare NaT with NaN
         tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([pd.NaT]),
-                                    np.array([False, True]))
+                                    np.array([False, False]))
 
     def test_isin_level_kwarg(self):
         def check_idx(idx):
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py