Skip to content

CLN: relocate lib.ismember* to hashtable space #15773

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 87 additions & 11 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
{{py:

# dtype, ttype
dtypes = [('float64', 'float64'),
('uint64', 'uint64'),
('object', 'pymap'),
('int64', 'int64')]
dtypes = [('float64', 'float64', 'float64_t'),
('uint64', 'uint64', 'uint64_t'),
('object', 'pymap', 'object'),
('int64', 'int64', 'int64_t')]

}}

{{for dtype, ttype in dtypes}}
{{for dtype, ttype, scalar in dtypes}}


@cython.wraparound(False)
Expand All @@ -34,9 +34,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
khiter_t k
Py_ssize_t i, n = len(values)

{{if dtype != 'object'}}
{{dtype}}_t val
{{endif}}
{{scalar}} val

int ret = 0

Expand Down Expand Up @@ -79,7 +77,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
{{if dtype == 'object'}}
cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna):
{{endif}}
cdef:
Py_ssize_t i=0
Expand Down Expand Up @@ -130,12 +128,11 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
@cython.boundscheck(False)
{{if dtype == 'object'}}


def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
{{else}}


def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
{{endif}}
cdef:
int ret = 0
Expand Down Expand Up @@ -203,8 +200,87 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
kh_destroy_{{ttype}}(table)
return out


#----------------------------------------------------------------------
# Membership
#----------------------------------------------------------------------


@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}

def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
{{else}}

def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
{{endif}}
    """
    Return a boolean array flagging, element by element, which entries
    of ``arr`` are present in ``values``.

    Parameters
    ----------
    arr : {{dtype}} ndarray
        Elements to test for membership.
    values : {{dtype}} ndarray
        Elements used to build the lookup hash table.
    hasnans : bint, optional
        When true, an element of ``arr`` that is not found in the table
        but compares unequal to itself (``val != val``) is also reported
        as a member.

    Returns
    -------
    boolean ndarray with the same length as ``arr``
    """
    cdef:
        Py_ssize_t i, n
        # khiter_t matches the declaration used by the other table
        # helpers in this file
        khiter_t k
        int ret = 0
        ndarray[uint8_t] result
        {{scalar}} val
        kh_{{ttype}}_t * table = kh_init_{{ttype}}()

    # the table is released in the ``finally`` clause so it is not
    # leaked if anything below raises
    try:
        # construct the table
        n = len(values)
        kh_resize_{{ttype}}(table, n)

        {{if dtype == 'object'}}
        for i in range(n):
            kh_put_{{ttype}}(table, <PyObject*> values[i], &ret)
        {{else}}
        with nogil:
            for i in range(n):
                kh_put_{{ttype}}(table, values[i], &ret)
        {{endif}}

        # test membership
        n = len(arr)
        result = np.empty(n, dtype=np.uint8)

        {{if dtype == 'object'}}
        for i in range(n):
            val = arr[i]
            k = kh_get_{{ttype}}(table, <PyObject*> val)
            # kh_get returns n_buckets when the key is absent
            if k != table.n_buckets:
                result[i] = 1
            else:
                result[i] = hasnans and val != val
        {{else}}
        with nogil:
            for i in range(n):
                val = arr[i]
                k = kh_get_{{ttype}}(table, val)
                # kh_get returns n_buckets when the key is absent
                if k != table.n_buckets:
                    result[i] = 1
                else:
                    result[i] = hasnans and val != val
        {{endif}}

        return result.view(np.bool_)
    finally:
        kh_destroy_{{ttype}}(table)

{{endfor}}



#----------------------------------------------------------------------
# Mode Computations
#----------------------------------------------------------------------
Expand Down
72 changes: 1 addition & 71 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ cdef extern from "numpy/arrayobject.h":
cdef enum NPY_TYPES:
NPY_intp "NPY_INTP"

from libc.stdlib cimport malloc, free

from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
PyDict_Contains, PyDict_Keys,
Expand Down Expand Up @@ -111,77 +112,6 @@ cpdef map_indices_list(list index):
return result


from libc.stdlib cimport malloc, free


def ismember_nans(float64_t[:] arr, set values, bint hasnans):
    """
    Flag which elements of a float64 array are contained in a set,
    optionally counting NaN elements as members.

    Parameters
    ----------
    arr : float64 memoryview
    values : set
    hasnans : bint
        When true, elements for which ``isnan`` is true are flagged
        even if they are not found in ``values``.

    Returns
    -------
    ndarray (boolean dtype), same length as ``arr``
    """
    cdef:
        Py_ssize_t idx, length
        ndarray[uint8_t] mask
        float64_t item

    length = len(arr)
    mask = np.empty(length, dtype=np.uint8)
    for idx in range(length):
        item = arr[idx]
        if item in values:
            mask[idx] = 1
        else:
            # not in the set: only NaN can still count, and only when
            # the caller asked for it
            mask[idx] = hasnans and isnan(item)

    return mask.view(np.bool_)


def ismember(ndarray arr, set values):
    """
    Flag which elements of ``arr`` are contained in ``values``.

    Parameters
    ----------
    arr : ndarray
    values : set

    Returns
    -------
    ismember : ndarray (boolean dtype), same length as ``arr``
    """
    cdef:
        Py_ssize_t pos, count
        ndarray[uint8_t] flags
        object item

    count = len(arr)
    flags = np.empty(count, dtype=np.uint8)
    for pos in range(count):
        item = util.get_value_at(arr, pos)
        if item in values:
            flags[pos] = 1
        else:
            flags[pos] = 0

    # reinterpret the uint8 buffer as booleans
    return flags.view(np.bool_)


def ismember_int64(ndarray[int64_t] arr, set values):
    """
    Check, element by element, whether the entries of an int64 array
    are contained in a set.

    Parameters
    ----------
    arr : ndarray of int64
    values : set

    Returns
    -------
    ismember : ndarray (boolean dtype), same length as ``arr``
    """
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result

    n = len(arr)
    result = np.empty(n, dtype=np.uint8)
    for i in range(n):
        result[i] = arr[i] in values

    # reinterpret the uint8 buffer as booleans
    return result.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def memory_usage_of_objects(ndarray[object, ndim=1] arr):
Expand Down
32 changes: 25 additions & 7 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
from pandas.types.common import (is_unsigned_integer_dtype,
is_signed_integer_dtype,
is_integer_dtype,
is_int64_dtype,
is_categorical_dtype,
is_extension_type,
is_datetimetz,
is_period_dtype,
is_period_arraylike,
is_numeric_dtype,
is_float_dtype,
is_bool_dtype,
needs_i8_conversion,
Expand Down Expand Up @@ -197,19 +197,37 @@ def isin(comps, values):
except TypeError:
# object array conversion will fail
pass
else:
elif is_numeric_dtype(comps):
comps = np.asarray(comps)
values = np.asarray(values)
else:
comps = np.asarray(comps).astype(object)
values = np.asarray(values).astype(object)

# GH11232
# work-around for numpy < 1.8 and comparisons on py3
# faster for larger cases to use np.in1d
f = lambda x, y: htable.ismember_object(x, values)
if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
f = lambda x, y: np.in1d(x, np.asarray(list(y)))
elif is_int64_dtype(comps):
f = lambda x, y: lib.ismember_int64(x, set(y))
else:
f = lambda x, y: lib.ismember(x, set(values))
f = lambda x, y: np.in1d(x, y)
elif is_integer_dtype(comps):
try:
values = values.astype('int64', copy=False)
comps = comps.astype('int64', copy=False)
f = lambda x, y: htable.ismember_int64(x, y)
except (TypeError, ValueError):
values = values.astype(object)
comps = comps.astype(object)

elif is_float_dtype(comps):
try:
values = values.astype('float64', copy=False)
comps = comps.astype('float64', copy=False)
checknull = isnull(values).any()
f = lambda x, y: htable.ismember_float64(x, y, checknull)
except (TypeError, ValueError):
values = values.astype(object)
comps = comps.astype(object)

return f(comps, values)

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5358,8 +5358,8 @@ def isin(self, values):
"you passed a "
"{0!r}".format(type(values).__name__))
return DataFrame(
lib.ismember(self.values.ravel(),
set(values)).reshape(self.shape), self.index,
algorithms.isin(self.values.ravel(),
values).reshape(self.shape), self.index,
self.columns)

# ----------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1392,7 +1392,7 @@ def _drop_from_level(self, labels, level):
index = self.levels[i]
values = index.get_indexer(labels)

mask = ~lib.ismember(self.labels[i], set(values))
mask = ~algos.isin(self.labels[i], values)

return self[mask]

Expand Down Expand Up @@ -2463,7 +2463,8 @@ def _wrap_joined_index(self, joined, other):
@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
if level is None:
return lib.ismember(np.array(self), set(values))
return algos.isin(self.values,
MultiIndex.from_tuples(values).values)
else:
num = self._get_level_number(level)
levs = self.levels[num]
Expand Down
8 changes: 3 additions & 5 deletions pandas/indexes/numeric.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import numpy as np
from pandas._libs import (lib, index as libindex,
from pandas._libs import (index as libindex,
algos as libalgos, join as libjoin)
from pandas.types.common import (is_dtype_equal, pandas_dtype,
is_float_dtype, is_object_dtype,
is_integer_dtype, is_scalar)
from pandas.types.missing import isnull
from pandas.core.common import _asarray_tuplesafe, _values_from_object

from pandas import compat
from pandas.core import algorithms
from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
from pandas.util.decorators import Appender, cache_readonly
import pandas.indexes.base as ibase
Expand Down Expand Up @@ -379,11 +379,9 @@ def is_unique(self):

@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
value_set = set(values)
if level is not None:
self._validate_index_level(level)
return lib.ismember_nans(np.array(self), value_set,
isnull(list(value_set)).any())
return algorithms.isin(np.array(self), values)


Float64Index._add_numeric_methods()
Expand Down
6 changes: 4 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.categorical import Categorical
from pandas.core import algorithms
from pandas.core.common import AbstractMethodError
from pandas.io.date_converters import generic_parser
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
Expand Down Expand Up @@ -1388,7 +1389,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
try:
values = lib.map_infer(values, conv_f)
except ValueError:
mask = lib.ismember(values, na_values).view(np.uint8)
mask = algorithms.isin(
values, list(na_values)).view(np.uint8)
values = lib.map_infer_mask(values, conv_f, mask)

cvals, na_count = self._infer_types(
Expand Down Expand Up @@ -1436,7 +1438,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):

na_count = 0
if issubclass(values.dtype.type, (np.number, np.bool_)):
mask = lib.ismember(values, na_values)
mask = algorithms.isin(values, list(na_values))
na_count = mask.sum()
if na_count > 0:
if is_integer_dtype(values):
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1363,14 +1363,17 @@ def test_isin_nan(self):
np.array([False, False]))
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
np.array([False, False]))

# Float64Index overrides isin, so must be checked separately
tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([np.nan]),
np.array([False, True]))
tm.assert_numpy_array_equal(
Float64Index([1.0, np.nan]).isin([float('nan')]),
np.array([False, True]))

# we cannot compare NaT with NaN
tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([pd.NaT]),
np.array([False, True]))
np.array([False, False]))

def test_isin_level_kwarg(self):
def check_idx(idx):
Expand Down
Loading