From a482acdf4b59b2643c7e2277a2db08b5901505b0 Mon Sep 17 00:00:00 2001 From: Joerg Date: Thu, 1 Feb 2018 17:30:46 +0100 Subject: [PATCH 1/9] dirty fix and add integration test --- pandas/core/algorithms.py | 4 ++-- pandas/tests/test_algos.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..bd780c74b5fa6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -70,9 +70,9 @@ def _ensure_data(values, dtype=None): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) return np.asarray(values).astype('uint64'), 'bool', 'uint64' - elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): + elif is_signed_integer_dtype(values) and is_signed_integer_dtype(dtype): return _ensure_int64(values), 'int64', 'int64' - elif (is_unsigned_integer_dtype(values) or + elif (is_unsigned_integer_dtype(values) and is_unsigned_integer_dtype(dtype)): return _ensure_uint64(values), 'uint64', 'uint64' elif is_float_dtype(values) or is_float_dtype(dtype): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b1e3177547ac6..400920b9b5b54 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -555,6 +555,19 @@ def test_empty(self, empty): result = algos.isin(vals, empty) tm.assert_numpy_array_equal(expected, result) + def test_regression_issue_19356(self): + # Regression test for GH19356 + l = [-9, -0.5] + expected = np.array([True, False]) + + series_float = pd.Series([-9.0, 0.0]) + result_float = series_float.isin(l) + tm.assert_numpy_array_equal(expected, result_float.values) + + series_int = pd.Series([-9, 0]) + result_int = series_int.isin(l) + tm.assert_numpy_array_equal(expected, result_int.values) + class TestValueCounts(object): From e5f6245d0e6ee7acad845eed583bff1f0b1ddbbc Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 10:09:04 +0100 Subject: [PATCH 2/9] unit tests pass --- pandas/core/algorithms.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bd780c74b5fa6..f44751f63a03b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, - is_categorical, is_datetimetz, + is_categorical, is_datetimetz, is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, @@ -70,9 +70,9 @@ def _ensure_data(values, dtype=None): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) return np.asarray(values).astype('uint64'), 'bool', 'uint64' - elif is_signed_integer_dtype(values) and is_signed_integer_dtype(dtype): + elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): return _ensure_int64(values), 'int64', 'int64' - elif (is_unsigned_integer_dtype(values) and + elif (is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype)): return _ensure_uint64(values), 'uint64', 'uint64' elif is_float_dtype(values) or is_float_dtype(dtype): @@ -405,7 +405,11 @@ def isin(comps, values): values = construct_1d_object_array_from_listlike(list(values)) comps, dtype, _ = _ensure_data(comps) - values, _, _ = _ensure_data(values, dtype=dtype) + if (all(is_datetime_or_timedelta_dtype(i) for i in values) and + is_datetime_or_timedelta_dtype(dtype)): + values, _, _ = _ensure_data(values, dtype=dtype) + else: + values, _, _ = _ensure_data(values) # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values) @@ -414,7 +418,7 @@ def isin(comps, values): # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1000000 and not is_object_dtype(comps): f = lambda x, y: np.in1d(x, y) - elif is_integer_dtype(comps): + elif is_integer_dtype(comps) and is_integer_dtype(values): try: values = values.astype('int64', copy=False) comps = comps.astype('int64', copy=False) @@ -423,7 +427,7 @@ def isin(comps, values): values = values.astype(object) comps = comps.astype(object) - elif is_float_dtype(comps): + elif is_float_dtype(comps) and is_float_dtype(values): try: values = values.astype('float64', copy=False) comps = comps.astype('float64', copy=False) @@ -432,6 +436,9 @@ def isin(comps, values): except (TypeError, ValueError): values = values.astype(object) comps = comps.astype(object) + else: + values = values.astype(object) + comps = comps.astype(object) return f(comps, values) From 0ea7348d81b7642695674d35ea9b53b4f66d090a Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 10:48:11 +0100 Subject: [PATCH 3/9] base tests fail --- pandas/core/algorithms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f44751f63a03b..6bab9214047a8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -404,9 +404,12 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + from pandas.core.dtypes.common import is_datetimelike comps, dtype, _ = _ensure_data(comps) - if (all(is_datetime_or_timedelta_dtype(i) for i in values) and - is_datetime_or_timedelta_dtype(dtype)): + # Convert `values` to `dtype` if `values` is datetime-like and `dtype` is datetime-like + if (is_datetime_or_timedelta_dtype(dtype) and + (is_datetime_or_timedelta_dtype(values) or + all(is_datetimelike(i) for i in values))): values, _, _ = _ensure_data(values, dtype=dtype) else: values, _, _ = _ensure_data(values) From c94a117290230f52f60f47b02021413c9fcde311 Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 11:08:55 +0100 Subject: [PATCH 4/9] base tests pass --- pandas/core/algorithms.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6bab9214047a8..88c325c702390 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -404,13 +404,17 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) - from pandas.core.dtypes.common import is_datetimelike + from pandas.core.dtypes.common import is_datetimelike, is_float comps, dtype, _ = _ensure_data(comps) - # Convert `values` to `dtype` if `values` is datetime-like and `dtype` is datetime-like + # Convert `values` to `dtype` if `values` is datetime/float-like and `dtype` is datetime/float-like if (is_datetime_or_timedelta_dtype(dtype) and (is_datetime_or_timedelta_dtype(values) or all(is_datetimelike(i) for i in values))): values, _, _ = _ensure_data(values, dtype=dtype) + elif (is_float_dtype(dtype) and + (is_float_dtype(values) or + all(is_float(i) for i in values))): + values, _, _ = _ensure_data(values, dtype=dtype) else: values, _, _ = _ensure_data(values) From 3b66275f39c1aa0f3141d414b6b0679a069eb5d1 Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 11:46:24 +0100 Subject: [PATCH 5/9] rearrange if clause --- pandas/core/algorithms.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88c325c702390..31e5c56468814 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -13,14 +13,14 @@ ABCIndexClass, ABCCategorical) from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, is_signed_integer_dtype, - is_integer_dtype, is_complex_dtype, + is_integer_dtype, is_complex_dtype, is_integer, is_float, is_object_dtype, is_categorical_dtype, is_sparse, is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, is_categorical, is_datetimetz, is_datetime_or_timedelta_dtype, - is_datetime64_any_dtype, is_datetime64tz_dtype, + is_datetime64_any_dtype, is_datetime64tz_dtype, is_datetimelike, is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, _ensure_platform_int, _ensure_object, @@ -404,16 +404,18 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) - from pandas.core.dtypes.common import is_datetimelike, is_float comps, dtype, _ = _ensure_data(comps) - # Convert `values` to `dtype` if `values` is datetime/float-like and `dtype` is datetime/float-like - if (is_datetime_or_timedelta_dtype(dtype) and - (is_datetime_or_timedelta_dtype(values) or - all(is_datetimelike(i) for i in values))): - values, _, _ = _ensure_data(values, dtype=dtype) - elif (is_float_dtype(dtype) and - (is_float_dtype(values) or - all(is_float(i) for i in values))): + # Convert `values` to `dtype` if dtype of `values` is like `dtype` + check_int = (is_integer_dtype(dtype) and + (is_integer_dtype(values) or + all(is_integer(i) for i in values))) + check_float = (is_float_dtype(dtype) and + (is_float_dtype(values) or + all(is_float(i) for i in values))) + check_datetime = (is_datetime_or_timedelta_dtype(dtype) and + (is_datetime_or_timedelta_dtype(values) or + all(is_datetimelike(i) for i in values))) + if check_int or check_float or check_datetime: values, _, _ = _ensure_data(values, dtype=dtype) else: values, _, _ = _ensure_data(values) From 21569bb675ccc9a2e57797e10f57989e2062a3de Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 12:06:39 +0100 Subject: [PATCH 6/9] add timestamp check --- pandas/core/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 31e5c56468814..faa8109ca168d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -32,6 +32,7 @@ from pandas.core import common as com from pandas._libs import algos, lib, hashtable as htable from pandas._libs.tslib import iNaT +from pandas._libs.tslibs.timestamps import Timestamp # --------------- # @@ -414,7 +415,8 @@ def isin(comps, values): all(is_float(i) for i in values))) check_datetime = (is_datetime_or_timedelta_dtype(dtype) and (is_datetime_or_timedelta_dtype(values) or - all(is_datetimelike(i) for i in values))) + all(is_datetimelike(i) for i in values) or + all(isinstance(i, Timestamp) for i in values))) if check_int or check_float or check_datetime: values, _, _ = _ensure_data(values, dtype=dtype) else: From bbd442a72a27c1c1f6375e59eb958f4e9b45bb68 Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 19:41:32 +0100 Subject: [PATCH 7/9] change test --- pandas/tests/test_algos.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 400920b9b5b54..9cccea17db5aa 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -557,17 +557,17 @@ def test_empty(self, empty): def test_regression_issue_19356(self): # Regression test for GH19356 - l = [-9, -0.5] + comp_list = [1, 0.5] expected = np.array([True, False]) - series_float = pd.Series([-9.0, 0.0]) - result_float = series_float.isin(l) - tm.assert_numpy_array_equal(expected, result_float.values) - - series_int = pd.Series([-9, 0]) - result_int = series_int.isin(l) + series_int = pd.Series([1, 0]) + result_int = series_int.isin(comp_list) tm.assert_numpy_array_equal(expected, result_int.values) + series_float = pd.Series([1.0, 0.0]) + result_float = series_float.isin(comp_list) + tm.assert_numpy_array_equal(expected, result_float.values) + class TestValueCounts(object): From cd14c56f5b3f8415e36b20934e8a011a462603d4 Mon Sep 17 00:00:00 2001 From: Joerg Date: Fri, 2 Feb 2018 20:34:46 +0100 Subject: [PATCH 8/9] extend regression test --- pandas/tests/test_algos.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9cccea17db5aa..9f0ac0ecc89e9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -557,16 +557,17 @@ def test_empty(self, empty): def test_regression_issue_19356(self): # Regression test for GH19356 - comp_list = [1, 0.5] - expected = np.array([True, False]) + result1 = pd.Series([1, 0]).isin([1, 0.5]) + expected1 = np.array([True, False]) + tm.assert_numpy_array_equal(expected1, result1.values) - series_int = pd.Series([1, 0]) - result_int = series_int.isin(comp_list) - tm.assert_numpy_array_equal(expected, result_int.values) + result2 = pd.Series([1.0, 0.0]).isin([1, 0.5]) + expected2 = np.array([True, False]) + tm.assert_numpy_array_equal(expected2, result2.values) - series_float = pd.Series([1.0, 0.0]) - result_float = series_float.isin(comp_list) - tm.assert_numpy_array_equal(expected, result_float.values) + result3 = pd.Series([1, 0]).isin([1.0, 0.5]) + expected3 = np.array([True, False]) + tm.assert_numpy_array_equal(expected3, result3.values) class TestValueCounts(object): From 80d9c3d34102025dd082ab07cb38ba5dca3821c4 Mon Sep 17 00:00:00 2001 From: Joerg Date: Mon, 5 Feb 2018 02:05:39 +0100 Subject: [PATCH 9/9] new approach, some tests fail --- pandas/core/algorithms.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index faa8109ca168d..31738d789e004 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -13,14 +13,14 @@ ABCIndexClass, ABCCategorical) from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, is_signed_integer_dtype, - is_integer_dtype, is_complex_dtype, is_integer, is_float, + is_integer_dtype, is_complex_dtype, is_object_dtype, is_categorical_dtype, is_sparse, is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, is_categorical, is_datetimetz, is_datetime_or_timedelta_dtype, - is_datetime64_any_dtype, is_datetime64tz_dtype, is_datetimelike, + is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, _ensure_platform_int, _ensure_object, @@ -405,22 +405,19 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) - comps, dtype, _ = _ensure_data(comps) - # Convert `values` to `dtype` if dtype of `values` is like `dtype` - check_int = (is_integer_dtype(dtype) and - (is_integer_dtype(values) or - all(is_integer(i) for i in values))) - check_float = (is_float_dtype(dtype) and - (is_float_dtype(values) or - all(is_float(i) for i in values))) - check_datetime = (is_datetime_or_timedelta_dtype(dtype) and - (is_datetime_or_timedelta_dtype(values) or - all(is_datetimelike(i) for i in values) or - all(isinstance(i, Timestamp) for i in values))) - if check_int or check_float or check_datetime: - values, _, _ = _ensure_data(values, dtype=dtype) - else: - values, _, _ = _ensure_data(values) + comps, dtype_comps, _ = _ensure_data(comps) + values, _, _ = _ensure_data(values) + # If items of `values` are of the same dtype... + dtypes_values_set = set([type(v) for v in values]) + if len(dtypes_values_set) == 1: + dtype_values_items = dtypes_values_set.pop() + # ...and if this dtype matches the dtype of `comps`... + is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x) or + x == Timestamp) + if (dtype_comps == dtype_values_items or + (is_time_like(dtype_values_items) and is_time_like(dtype_comps))): + #...then coerce `values` to type of `comps`. + values, _, _ = _ensure_data(values, dtype=dtype_comps) # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values)