Skip to content

CLN: Remove special handling of nans in the float64-case of isin #22117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 10, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,27 @@ def time_isin(self, dtypes):
self.s.isin(self.values)


class IsInFloat64(object):
    """Benchmarks for ``isin`` on a small float64 Series.

    Each benchmark looks up the two-element Series ``self.small`` in a
    large float64 array of values.  The three value arrays created in
    ``setup`` differ in how many distinct entries they hold: all
    different, all zero, and all NaN.
    """

    def setup(self):
        self.small = Series([1, 2], dtype=np.float64)
        self.many_different_values = np.arange(10**6, dtype=np.float64)
        self.few_different_values = np.zeros(10**7, dtype=np.float64)
        self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)

    def time_isin_many_different(self):
        # runtime is dominated by creation of the lookup-table
        self.small.isin(self.many_different_values)

    def time_isin_few_different(self):
        # runtime is dominated by creation of the lookup-table
        self.small.isin(self.few_different_values)

    def time_isin_nan_values(self):
        # runtime is dominated by creation of the lookup-table
        # Use the all-NaN array here: passing ``few_different_values``
        # would merely repeat ``time_isin_few_different`` and leave the
        # NaN-only case unmeasured.
        self.small.isin(self.only_nans_values)


class IsInForObjects(object):

def setup(self):
Expand Down
15 changes: 4 additions & 11 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
@cython.boundscheck(False)
{{if dtype == 'object'}}

def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
{{else}}

def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
{{endif}}

"""
Expand All @@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
----------
arr : {{dtype}} ndarray
values : {{dtype}} ndarray
hasnans : bint, optional

Returns
-------
Expand Down Expand Up @@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
for i in range(n):
val = arr[i]
k = kh_get_{{ttype}}(table, <PyObject*> val)
if k != table.n_buckets:
result[i] = 1
else:
result[i] = hasnans and val != val
result[i] = (k != table.n_buckets)
{{else}}
with nogil:
for i in range(n):
val = arr[i]
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
result[i] = 1
else:
result[i] = hasnans and val != val
result[i] = (k != table.n_buckets)
{{endif}}

kh_destroy_{{ttype}}(table)
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,8 +437,7 @@ def isin(comps, values):
try:
values = values.astype('float64', copy=False)
comps = comps.astype('float64', copy=False)
checknull = isna(values).any()
f = lambda x, y: htable.ismember_float64(x, y, checknull)
f = lambda x, y: htable.ismember_float64(x, y)
except (TypeError, ValueError):
values = values.astype(object)
comps = comps.astype(object)
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,28 @@ def test_different_nan_objects(self):
result = algos.isin(comps, vals)
tm.assert_numpy_array_equal(expected, result)

def test_different_nans_as_float64(self):
    # GH 21866
    # Two NaNs are built from distinct bit-patterns; without special
    # care such NaNs would end up in different hash-table buckets.
    first_nan, second_nan = (
        struct.unpack("d", struct.pack("=Q", bits))[0]
        for bits in (0x7ff8000000000000, 0x7ff8000000000001)
    )
    # sanity check: both values really are NaN
    assert first_nan != first_nan
    assert second_nan != second_nan

    # Whichever NaN is used as the lookup value, both NaNs in the
    # searched array must be reported as members.
    candidates = np.array([first_nan, second_nan], dtype=np.float64)
    for lookup_nan in (first_nan, second_nan):
        lookup = np.array([lookup_nan], dtype=np.float64)
        result = algos.isin(candidates, lookup)
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(result, expected)


class TestValueCounts(object):

Expand Down