Skip to content

Commit 32dd929

Browse files
committed
CLN: relocate lib.ismember* to hashtable space
- fixes .isin on 32-bit (hopefully)
- perf about 30% better
- releases the GIL

Author: Jeff Reback <[email protected]>

Closes pandas-dev#15773 from jreback/ismember and squashes the following commits:

a7dfe51 [Jeff Reback] CLN: relocate lib.ismember* to hashtable space
1 parent 1c9d46a commit 32dd929

File tree

9 files changed

+131
-102
lines changed

9 files changed

+131
-102
lines changed

pandas/_libs/hashtable_func_helper.pxi.in

+87-11
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1111
{{py:
1212

1313
# dtype, ttype
14-
dtypes = [('float64', 'float64'),
15-
('uint64', 'uint64'),
16-
('object', 'pymap'),
17-
('int64', 'int64')]
14+
dtypes = [('float64', 'float64', 'float64_t'),
15+
('uint64', 'uint64', 'uint64_t'),
16+
('object', 'pymap', 'object'),
17+
('int64', 'int64', 'int64_t')]
1818

1919
}}
2020

21-
{{for dtype, ttype in dtypes}}
21+
{{for dtype, ttype, scalar in dtypes}}
2222

2323

2424
@cython.wraparound(False)
@@ -34,9 +34,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
3434
khiter_t k
3535
Py_ssize_t i, n = len(values)
3636

37-
{{if dtype != 'object'}}
38-
{{dtype}}_t val
39-
{{endif}}
37+
{{scalar}} val
4038

4139
int ret = 0
4240

@@ -79,7 +77,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
7977
{{if dtype == 'object'}}
8078
cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
8179
{{else}}
82-
cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
80+
cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna):
8381
{{endif}}
8482
cdef:
8583
Py_ssize_t i=0
@@ -130,12 +128,11 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna):
130128
@cython.boundscheck(False)
131129
{{if dtype == 'object'}}
132130

133-
134131
def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
135132
{{else}}
136133

137134

138-
def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
135+
def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
139136
{{endif}}
140137
cdef:
141138
int ret = 0
@@ -203,8 +200,87 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'):
203200
kh_destroy_{{ttype}}(table)
204201
return out
205202

203+
204+
#----------------------------------------------------------------------
205+
# Membership
206+
#----------------------------------------------------------------------
207+
208+
209+
@cython.wraparound(False)
210+
@cython.boundscheck(False)
211+
{{if dtype == 'object'}}
212+
213+
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
214+
{{else}}
215+
216+
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
217+
{{endif}}
218+
219+
"""
220+
Return boolean of values in arr on an
221+
element-by-element basis
222+
223+
Parameters
224+
----------
225+
arr : {{dtype}} ndarray
226+
values : {{dtype}} ndarray
227+
hasnans : bint, optional
228+
229+
Returns
230+
-------
231+
boolean ndarray of len(arr)
232+
"""
233+
cdef:
234+
Py_ssize_t i, n, k
235+
int ret = 0
236+
ndarray[uint8_t] result
237+
{{scalar}} val
238+
kh_{{ttype}}_t * table = kh_init_{{ttype}}()
239+
240+
241+
# construct the table
242+
n = len(values)
243+
kh_resize_{{ttype}}(table, min(n, len(values)))
244+
245+
{{if dtype == 'object'}}
246+
for i in range(n):
247+
kh_put_{{ttype}}(table, <PyObject*> values[i], &ret)
248+
{{else}}
249+
with nogil:
250+
for i in range(n):
251+
kh_put_{{ttype}}(table, values[i], &ret)
252+
{{endif}}
253+
254+
# test membership
255+
n = len(arr)
256+
result = np.empty(n, dtype=np.uint8)
257+
258+
{{if dtype == 'object'}}
259+
for i in range(n):
260+
val = arr[i]
261+
k = kh_get_{{ttype}}(table, <PyObject*> val)
262+
if k != table.n_buckets:
263+
result[i] = 1
264+
else:
265+
result[i] = hasnans and val != val
266+
{{else}}
267+
with nogil:
268+
for i in range(n):
269+
val = arr[i]
270+
k = kh_get_{{ttype}}(table, val)
271+
if k != table.n_buckets:
272+
result[i] = 1
273+
else:
274+
result[i] = hasnans and val != val
275+
{{endif}}
276+
277+
kh_destroy_{{ttype}}(table)
278+
return result.view(np.bool_)
279+
206280
{{endfor}}
207281

282+
283+
208284
#----------------------------------------------------------------------
209285
# Mode Computations
210286
#----------------------------------------------------------------------

pandas/_libs/lib.pyx

+1-71
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ cdef extern from "numpy/arrayobject.h":
1313
cdef enum NPY_TYPES:
1414
NPY_intp "NPY_INTP"
1515

16+
from libc.stdlib cimport malloc, free
1617

1718
from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
1819
PyDict_Contains, PyDict_Keys,
@@ -111,77 +112,6 @@ cpdef map_indices_list(list index):
111112
return result
112113

113114

114-
from libc.stdlib cimport malloc, free
115-
116-
117-
def ismember_nans(float64_t[:] arr, set values, bint hasnans):
118-
cdef:
119-
Py_ssize_t i, n
120-
ndarray[uint8_t] result
121-
float64_t val
122-
123-
n = len(arr)
124-
result = np.empty(n, dtype=np.uint8)
125-
for i in range(n):
126-
val = arr[i]
127-
result[i] = val in values or hasnans and isnan(val)
128-
129-
return result.view(np.bool_)
130-
131-
132-
def ismember(ndarray arr, set values):
133-
"""
134-
Checks whether
135-
136-
Parameters
137-
----------
138-
arr : ndarray
139-
values : set
140-
141-
Returns
142-
-------
143-
ismember : ndarray (boolean dtype)
144-
"""
145-
cdef:
146-
Py_ssize_t i, n
147-
ndarray[uint8_t] result
148-
object val
149-
150-
n = len(arr)
151-
result = np.empty(n, dtype=np.uint8)
152-
for i in range(n):
153-
val = util.get_value_at(arr, i)
154-
result[i] = val in values
155-
156-
return result.view(np.bool_)
157-
158-
159-
def ismember_int64(ndarray[int64_t] arr, set values):
160-
"""
161-
Checks whether
162-
163-
Parameters
164-
----------
165-
arr : ndarray of int64
166-
values : set
167-
168-
Returns
169-
-------
170-
ismember : ndarray (boolean dtype)
171-
"""
172-
cdef:
173-
Py_ssize_t i, n
174-
ndarray[uint8_t] result
175-
int64_t v
176-
177-
n = len(arr)
178-
result = np.empty(n, dtype=np.uint8)
179-
for i in range(n):
180-
result[i] = arr[i] in values
181-
182-
return result.view(np.bool_)
183-
184-
185115
@cython.wraparound(False)
186116
@cython.boundscheck(False)
187117
def memory_usage_of_objects(ndarray[object, ndim=1] arr):

pandas/core/algorithms.py

+25-7
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
from pandas.types.common import (is_unsigned_integer_dtype,
1313
is_signed_integer_dtype,
1414
is_integer_dtype,
15-
is_int64_dtype,
1615
is_categorical_dtype,
1716
is_extension_type,
1817
is_datetimetz,
1918
is_period_dtype,
2019
is_period_arraylike,
20+
is_numeric_dtype,
2121
is_float_dtype,
2222
is_bool_dtype,
2323
needs_i8_conversion,
@@ -197,19 +197,37 @@ def isin(comps, values):
197197
except TypeError:
198198
# object array conversion will fail
199199
pass
200-
else:
200+
elif is_numeric_dtype(comps):
201201
comps = np.asarray(comps)
202202
values = np.asarray(values)
203+
else:
204+
comps = np.asarray(comps).astype(object)
205+
values = np.asarray(values).astype(object)
203206

204207
# GH11232
205208
# work-around for numpy < 1.8 and comparisons on py3
206209
# faster for larger cases to use np.in1d
210+
f = lambda x, y: htable.ismember_object(x, values)
207211
if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
208-
f = lambda x, y: np.in1d(x, np.asarray(list(y)))
209-
elif is_int64_dtype(comps):
210-
f = lambda x, y: lib.ismember_int64(x, set(y))
211-
else:
212-
f = lambda x, y: lib.ismember(x, set(values))
212+
f = lambda x, y: np.in1d(x, y)
213+
elif is_integer_dtype(comps):
214+
try:
215+
values = values.astype('int64', copy=False)
216+
comps = comps.astype('int64', copy=False)
217+
f = lambda x, y: htable.ismember_int64(x, y)
218+
except (TypeError, ValueError):
219+
values = values.astype(object)
220+
comps = comps.astype(object)
221+
222+
elif is_float_dtype(comps):
223+
try:
224+
values = values.astype('float64', copy=False)
225+
comps = comps.astype('float64', copy=False)
226+
checknull = isnull(values).any()
227+
f = lambda x, y: htable.ismember_float64(x, y, checknull)
228+
except (TypeError, ValueError):
229+
values = values.astype(object)
230+
comps = comps.astype(object)
213231

214232
return f(comps, values)
215233

pandas/core/frame.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -5358,8 +5358,8 @@ def isin(self, values):
53585358
"you passed a "
53595359
"{0!r}".format(type(values).__name__))
53605360
return DataFrame(
5361-
lib.ismember(self.values.ravel(),
5362-
set(values)).reshape(self.shape), self.index,
5361+
algorithms.isin(self.values.ravel(),
5362+
values).reshape(self.shape), self.index,
53635363
self.columns)
53645364

53655365
# ----------------------------------------------------------------------

pandas/indexes/multi.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1392,7 +1392,7 @@ def _drop_from_level(self, labels, level):
13921392
index = self.levels[i]
13931393
values = index.get_indexer(labels)
13941394

1395-
mask = ~lib.ismember(self.labels[i], set(values))
1395+
mask = ~algos.isin(self.labels[i], values)
13961396

13971397
return self[mask]
13981398

@@ -2463,7 +2463,8 @@ def _wrap_joined_index(self, joined, other):
24632463
@Appender(Index.isin.__doc__)
24642464
def isin(self, values, level=None):
24652465
if level is None:
2466-
return lib.ismember(np.array(self), set(values))
2466+
return algos.isin(self.values,
2467+
MultiIndex.from_tuples(values).values)
24672468
else:
24682469
num = self._get_level_number(level)
24692470
levs = self.levels[num]

pandas/indexes/numeric.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import numpy as np
2-
from pandas._libs import (lib, index as libindex,
2+
from pandas._libs import (index as libindex,
33
algos as libalgos, join as libjoin)
44
from pandas.types.common import (is_dtype_equal, pandas_dtype,
55
is_float_dtype, is_object_dtype,
66
is_integer_dtype, is_scalar)
7-
from pandas.types.missing import isnull
87
from pandas.core.common import _asarray_tuplesafe, _values_from_object
98

109
from pandas import compat
10+
from pandas.core import algorithms
1111
from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
1212
from pandas.util.decorators import Appender, cache_readonly
1313
import pandas.indexes.base as ibase
@@ -379,11 +379,9 @@ def is_unique(self):
379379

380380
@Appender(Index.isin.__doc__)
381381
def isin(self, values, level=None):
382-
value_set = set(values)
383382
if level is not None:
384383
self._validate_index_level(level)
385-
return lib.ismember_nans(np.array(self), value_set,
386-
isnull(list(value_set)).any())
384+
return algorithms.isin(np.array(self), values)
387385

388386

389387
Float64Index._add_numeric_methods()

pandas/io/parsers.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pandas.core.series import Series
2727
from pandas.core.frame import DataFrame
2828
from pandas.core.categorical import Categorical
29+
from pandas.core import algorithms
2930
from pandas.core.common import AbstractMethodError
3031
from pandas.io.date_converters import generic_parser
3132
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
@@ -1388,7 +1389,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13881389
try:
13891390
values = lib.map_infer(values, conv_f)
13901391
except ValueError:
1391-
mask = lib.ismember(values, na_values).view(np.uint8)
1392+
mask = algorithms.isin(
1393+
values, list(na_values)).view(np.uint8)
13921394
values = lib.map_infer_mask(values, conv_f, mask)
13931395

13941396
cvals, na_count = self._infer_types(
@@ -1436,7 +1438,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
14361438

14371439
na_count = 0
14381440
if issubclass(values.dtype.type, (np.number, np.bool_)):
1439-
mask = lib.ismember(values, na_values)
1441+
mask = algorithms.isin(values, list(na_values))
14401442
na_count = mask.sum()
14411443
if na_count > 0:
14421444
if is_integer_dtype(values):

pandas/tests/indexes/test_base.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1363,14 +1363,17 @@ def test_isin_nan(self):
13631363
np.array([False, False]))
13641364
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
13651365
np.array([False, False]))
1366+
13661367
# Float64Index overrides isin, so must be checked separately
13671368
tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([np.nan]),
13681369
np.array([False, True]))
13691370
tm.assert_numpy_array_equal(
13701371
Float64Index([1.0, np.nan]).isin([float('nan')]),
13711372
np.array([False, True]))
1373+
1374+
# we cannot compare NaT with NaN
13721375
tm.assert_numpy_array_equal(Float64Index([1.0, np.nan]).isin([pd.NaT]),
1373-
np.array([False, True]))
1376+
np.array([False, False]))
13741377

13751378
def test_isin_level_kwarg(self):
13761379
def check_idx(idx):

0 commit comments

Comments
 (0)