-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fixes unwanted casting in .isin (GH21804) #21893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
d0c7ebc
143566a
dd60b4e
3209172
18751ba
031616d
9d92a76
d8b5242
050f80a
3aa8561
ee66578
84be606
f8cc271
b71dad6
5416711
9fca52c
dd37f9c
52f8131
63a5f15
2e0bb49
82386c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ | |
is_period_dtype, | ||
is_numeric_dtype, is_float_dtype, | ||
is_bool_dtype, needs_i8_conversion, | ||
is_datetimetz, | ||
is_datetimetz, is_datetime_or_timedelta_dtype, | ||
is_datetime64_any_dtype, is_datetime64tz_dtype, | ||
is_timedelta64_dtype, is_datetimelike, | ||
is_interval_dtype, is_scalar, is_list_like, | ||
|
@@ -39,6 +39,8 @@ | |
from pandas.util._decorators import (Appender, Substitution, | ||
deprecate_kwarg) | ||
|
||
from pandas._libs.tslibs.timestamps import Timestamp | ||
|
||
_shared_docs = {} | ||
|
||
|
||
|
@@ -413,35 +415,41 @@ def isin(comps, values): | |
return comps._values.isin(values) | ||
|
||
comps = com._values_from_object(comps) | ||
comps, dtype_comps, _ = _ensure_data(comps) | ||
|
||
comps, dtype, _ = _ensure_data(comps) | ||
values, _, _ = _ensure_data(values, dtype=dtype) | ||
is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i already indicate that this is not the path forward this is much slower than the existing you need to keep it along the current structure and post benchmarks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback Thanks - have removed the flags
Also, could not get around writing a custom function Please feel free to close this PR - since I think I am not sure I understand what is the expectation in terms of maintaining the existing structure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there are many benchmarks the point is that this code is highly sensistive to changes and requires a lot of benchmark running to avoid regressions |
||
or isinstance(x, Timestamp)) | ||
|
||
# faster for larger cases to use np.in1d | ||
f = lambda x, y: htable.ismember_object(x, values) | ||
is_int = lambda x: ((x == np.int64) or (x == int)) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. woa, what are you doing? this is way less understandable that before. too many more if/thens here. pls fit this into the existing structure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have edited to retain most of the existing structure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Revised asv benchmarks on algorithms after 5416711
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback thanks for the review - any other edits needed? |
||
is_float = lambda x: ((x == np.float64) or (x == float)) | ||
|
||
f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object)) | ||
|
||
# GH16012 | ||
# Ensure np.in1d doesn't get object types or it *may* throw an exception | ||
# faster for larger cases to use np.in1d | ||
if len(comps) > 1000000 and not is_object_dtype(comps): | ||
f = lambda x, y: np.in1d(x, y) | ||
elif is_integer_dtype(comps): | ||
try: | ||
values = values.astype('int64', copy=False) | ||
comps = comps.astype('int64', copy=False) | ||
f = lambda x, y: htable.ismember_int64(x, y) | ||
except (TypeError, ValueError): | ||
values = values.astype(object) | ||
comps = comps.astype(object) | ||
|
||
elif is_float_dtype(comps): | ||
try: | ||
values = values.astype('float64', copy=False) | ||
comps = comps.astype('float64', copy=False) | ||
checknull = isna(values).any() | ||
f = lambda x, y: htable.ismember_float64(x, y, checknull) | ||
except (TypeError, ValueError): | ||
values = values.astype(object) | ||
comps = comps.astype(object) | ||
if is_time_like(dtype_comps): | ||
values, _, _ = _ensure_data(values, dtype=dtype_comps) | ||
else: | ||
values, dtype_values, _ = _ensure_data(values) | ||
comps_types = set(type(v) for v in comps) | ||
values_types = set(type(v) for v in values) | ||
if len(comps_types) == len(values_types) == 1: | ||
comps_types = comps_types.pop() | ||
values_types = values_types.pop() | ||
if (is_int(comps_types) and is_int(values_types)): | ||
values = values.astype('int64', copy=False) | ||
comps = comps.astype('int64', copy=False) | ||
f = lambda x, y: htable.ismember_int64(x, y) | ||
elif (is_float(comps_types) and is_float(values_types)): | ||
values = values.astype('float64', copy=False) | ||
comps = comps.astype('float64', copy=False) | ||
checknull = isna(values).any() | ||
f = lambda x, y: htable.ismember_float64(x, y, checknull) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How come you were able to remove the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the earlier code only dtype of |
||
|
||
return f(comps, values) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -509,42 +509,23 @@ def test_invalid(self): | |
pytest.raises(TypeError, lambda: algos.isin(1, [1])) | ||
pytest.raises(TypeError, lambda: algos.isin([1], 1)) | ||
|
||
def test_basic(self): | ||
|
||
result = algos.isin([1, 2], [1]) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(np.array([1, 2]), [1]) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(Series([1, 2]), [1]) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(Series([1, 2]), Series([1])) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(Series([1, 2]), set([1])) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(['a', 'b'], ['a']) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(Series(['a', 'b']), Series(['a'])) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(Series(['a', 'b']), set(['a'])) | ||
expected = np.array([True, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = algos.isin(['a', 'b'], [1]) | ||
expected = np.array([False, False]) | ||
@pytest.mark.parametrize("comps,values,expected", [ | ||
([1, 2], [1], [True, False]), | ||
([1, 0], [1, 0.5], [True, False]), | ||
([1.0, 0], [1, 0.5], [True, False]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To reviewers: Just for reference, here are the new tests. @KalyanGokhale : Nice refactoring! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @gfyoung |
||
([1.0, 0.0], [1, 0], [True, True]), | ||
(np.array([1, 2]), [1], [True, False]), | ||
(Series([1, 2]), [1], [True, False]), | ||
(Series([1, 2]), Series([1]), [True, False]), | ||
(Series([1, 2]), set([1]), [True, False]), | ||
(['a', 'b'], ['a'], [True, False]), | ||
(Series(['a', 'b']), Series(['a']), [True, False]), | ||
(Series(['a', 'b']), set(['a']), [True, False]), | ||
(['a', 'b'], [1], [False, False]) | ||
]) | ||
def test_basic(self, comps, values, expected): | ||
result = algos.isin(comps, values) | ||
expected = np.array(expected) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_i8(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you be more specific here, as a user I have no idea what this means.