From 0f950dea1d7d12604b26e13ab85b72c57f89672c Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 29 Jul 2018 22:30:20 +0200 Subject: [PATCH 1/5] CLN: Remove special handling of nans in the float64-case of isin It is no longer needed because the hash-table handles the nans correctly out of the box (see GH21866). Not having to scan the values via isna(...).any() will improve the performance. --- pandas/_libs/hashtable_func_helper.pxi.in | 15 ++++----------- pandas/core/algorithms.py | 3 +-- pandas/tests/test_algos.py | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 521e564447c59..e7f3701ee83c3 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): @cython.boundscheck(False) {{if dtype == 'object'}} -def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0): +def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values): {{else}} -def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0): +def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{endif}} """ @@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0): ---------- arr : {{dtype}} ndarray values : {{dtype}} ndarray - hasnans : bint, optional Returns ------- @@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0): for i in range(n): val = arr[i] k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - result[i] = 1 - else: - result[i] = hasnans and val != val + result[i] = (k != table.n_buckets) {{else}} with nogil: for i in range(n): val = arr[i] k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - result[i] = 1 - else: - result[i] = hasnans and val != 
val + result[i] = (k != table.n_buckets) {{endif}} kh_destroy_{{ttype}}(table) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4bf62b021cddc..2773e7b230084 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -437,8 +437,7 @@ def isin(comps, values): try: values = values.astype('float64', copy=False) comps = comps.astype('float64', copy=False) - checknull = isna(values).any() - f = lambda x, y: htable.ismember_float64(x, y, checknull) + f = lambda x, y: htable.ismember_float64(x, y) except (TypeError, ValueError): values = values.astype(object) comps = comps.astype(object) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f89c7545765c9..716b60a13624b 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -632,6 +632,27 @@ def test_different_nan_objects(self): result = algos.isin(comps, vals) tm.assert_numpy_array_equal(expected, result) + def test_different_nans_as_float64(self): + # create different nans from bit-patterns, + # these nans will land in different buckets in the hash-table + # if no special care is taken + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + + # check that NAN1 and NAN2 are equivalent: + arr = np.array([NAN1, NAN2], dtype=np.float64) + lookup1 = np.array([NAN1], dtype=np.float64) + result = algos.isin(arr, lookup1) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + lookup2 = np.array([NAN2], dtype=np.float64) + result = algos.isin(arr, lookup2) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + class TestValueCounts(object): From 6791a2cb09930bae7d9dec09159bcb0400858629 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Jul 2018 23:42:10 +0200 Subject: [PATCH 2/5] adding performance tests for isin in combination with float64 --- 
asv_bench/benchmarks/series_methods.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 34a8d552304d1..7e9a99ca4b2c8 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -27,16 +27,27 @@ def time_constructor(self, data): class IsIn(object): goal_time = 0.2 - params = ['int64', 'object'] + params = ['int64', 'object', 'float64'] param_names = ['dtype'] def setup(self, dtype): self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) self.values = [1, 2] + self.small = Series(np.random.randint(1, 10, 10)).astype(dtype) + self.many_different_values = np.arange(10**6).astype(dtype) + self.few_different_values = np.zeros(10**6).astype(dtype) def time_isin(self, dtypes): self.s.isin(self.values) + def time_isin_many_different(self, dtypes): + # runtime is dominated by creation of the lookup-table + self.small.isin(self.many_different_values) + + def time_isin_few_different(self, dtypes): + # runtime is dominated by creation of the lookup-table + self.small.isin(self.few_different_values) + class IsInForObjects(object): From f4ab90336b36735c5bdf16fbf7dd2bc463119f17 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 30 Jul 2018 23:44:29 +0200 Subject: [PATCH 3/5] adding bug-id to unit test --- pandas/tests/test_algos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 716b60a13624b..4f8f61a9884b9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -633,6 +633,7 @@ def test_different_nan_objects(self): tm.assert_numpy_array_equal(expected, result) def test_different_nans_as_float64(self): + # GH 21866 # create different nans from bit-patterns, # these nans will land in different buckets in the hash-table # if no special care is taken From 9f5524d58c412c4fc00dd65f180e413a6038c5cf Mon Sep 17 00:00:00 2001 From: 
Egor Dranischnikow Date: Sat, 4 Aug 2018 07:26:14 +0200 Subject: [PATCH 4/5] tweaking tests --- asv_bench/benchmarks/series_methods.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 7e9a99ca4b2c8..983b8f5ff3f1c 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -27,24 +27,34 @@ def time_constructor(self, data): class IsIn(object): goal_time = 0.2 - params = ['int64', 'object', 'float64'] + params = ['int64', 'object'] param_names = ['dtype'] def setup(self, dtype): self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) self.values = [1, 2] - self.small = Series(np.random.randint(1, 10, 10)).astype(dtype) - self.many_different_values = np.arange(10**6).astype(dtype) - self.few_different_values = np.zeros(10**6).astype(dtype) def time_isin(self, dtypes): self.s.isin(self.values) - def time_isin_many_different(self, dtypes): + +class IsInFloat64(object): + + def setup(self): + self.small = Series(np.random.randint(1, 10, 10)).astype(dtype=np.float64) + self.many_different_values = np.arange(10**6, dtype=np.float64) + self.few_different_values = np.zeros(10**7, dtype=np.float64) + self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64) + + def time_isin_many_different(self): # runtime is dominated by creation of the lookup-table self.small.isin(self.many_different_values) - def time_isin_few_different(self, dtypes): + def time_isin_few_different(self): + # runtime is dominated by creation of the lookup-table + self.small.isin(self.few_different_values) + + def time_isin_nan_values(self): # runtime is dominated by creation of the lookup-table self.small.isin(self.few_different_values) From 567e7bb84a7e172076e759b04d1557e5021be630 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 4 Aug 2018 11:12:58 +0200 Subject: [PATCH 5/5] pep8 problems --- 
asv_bench/benchmarks/series_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 983b8f5ff3f1c..a26c5d89bc483 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -41,9 +41,9 @@ def time_isin(self, dtypes): class IsInFloat64(object): def setup(self): - self.small = Series(np.random.randint(1, 10, 10)).astype(dtype=np.float64) + self.small = Series([1, 2], dtype=np.float64) self.many_different_values = np.arange(10**6, dtype=np.float64) - self.few_different_values = np.zeros(10**7, dtype=np.float64) + self.few_different_values = np.zeros(10**7, dtype=np.float64) self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64) def time_isin_many_different(self):