Skip to content

BUG: Fixes unwanted casting in .isin (GH21804) #21893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Bug Fixes

**Conversion**

-
- Bug where unwanted casting of float to int in :func:`isin` led to incorrect comparison outcome (:issue:`21804`)
-

**Indexing**
Expand Down
37 changes: 23 additions & 14 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
is_period_dtype,
is_numeric_dtype, is_float_dtype,
is_bool_dtype, needs_i8_conversion,
is_datetimetz,
is_datetimetz, is_datetime_or_timedelta_dtype,
is_datetime64_any_dtype, is_datetime64tz_dtype,
is_timedelta64_dtype, is_datetimelike,
is_interval_dtype, is_scalar, is_list_like,
Expand All @@ -39,6 +39,8 @@
from pandas.util._decorators import (Appender, Substitution,
deprecate_kwarg)

from pandas._libs.tslibs.timestamps import Timestamp

_shared_docs = {}


Expand Down Expand Up @@ -415,33 +417,40 @@ def isin(comps, values):
comps = com._values_from_object(comps)

comps, dtype, _ = _ensure_data(comps)
values, _, _ = _ensure_data(values, dtype=dtype)

is_time_like = lambda x: (is_datetime_or_timedelta_dtype(x)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i already indicate that this is not the path forward

this is much slower than the existing

you need to keep it along the current structure

and post benchmarks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback Thanks - have removed the flags int_flg / float_flg as suggested per your earlier review and posted the benchmarks after each commit.
Re-pasting the benchmarks after the last commit

ASV results on algorithms after 82386c3 are similar to the most recent one 5416711

       before           after         ratio
     [537b65cb]       [82386c31]
-     2.36±0.02ms      1.92±0.02ms     0.82  algorithms.Hashing.time_series_timedeltas

SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.

Also, could not get around writing a custom function is_time_like - had tried it in
9fca52c

Please feel free to close this PR - since I think I am not sure I understand what is the expectation in terms of maintaining the existing structure

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there are many benchmarks

the point is that this code is highly sensistive to changes and requires a lot of benchmark running to avoid regressions

or isinstance(x, Timestamp))

is_int = lambda x: ((x == np.int64) or (x == int))
is_float = lambda x: ((x == np.float64) or (x == float))

if is_time_like(dtype):
values, _, _ = _ensure_data(values, dtype=dtype)
else:
values, _, _ = _ensure_data(values)

comps_types = set(type(v) for v in comps)
values_types = set(type(v) for v in values)

# faster for larger cases to use np.in1d
f = lambda x, y: htable.ismember_object(x, values)
f = lambda x, y: htable.ismember_object(x.astype(object), y.astype(object))

# GH16012
# Ensure np.in1d doesn't get object types or it *may* throw an exception
if len(comps) > 1000000 and not is_object_dtype(comps):
f = lambda x, y: np.in1d(x, y)
elif is_integer_dtype(comps):
try:
elif len(comps_types) == len(values_types) == 1:
comps_types = comps_types.pop()
values_types = values_types.pop()
if (is_int(comps_types) and is_int(values_types)):
values = values.astype('int64', copy=False)
comps = comps.astype('int64', copy=False)
f = lambda x, y: htable.ismember_int64(x, y)
except (TypeError, ValueError):
values = values.astype(object)
comps = comps.astype(object)

elif is_float_dtype(comps):
try:
elif (is_float(comps_types) and is_float(values_types)):
values = values.astype('float64', copy=False)
comps = comps.astype('float64', copy=False)
checknull = isna(values).any()
f = lambda x, y: htable.ismember_float64(x, y, checknull)
except (TypeError, ValueError):
values = values.astype(object)
comps = comps.astype(object)

return f(comps, values)

Expand Down
53 changes: 17 additions & 36 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,42 +509,23 @@ def test_invalid(self):
pytest.raises(TypeError, lambda: algos.isin(1, [1]))
pytest.raises(TypeError, lambda: algos.isin([1], 1))

def test_basic(self):

result = algos.isin([1, 2], [1])
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(np.array([1, 2]), [1])
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(Series([1, 2]), [1])
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(Series([1, 2]), Series([1]))
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(Series([1, 2]), set([1]))
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(['a', 'b'], ['a'])
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(Series(['a', 'b']), Series(['a']))
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(Series(['a', 'b']), set(['a']))
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)

result = algos.isin(['a', 'b'], [1])
expected = np.array([False, False])
@pytest.mark.parametrize("comps,values,expected", [
([1, 2], [1], [True, False]),
([1, 0], [1, 0.5], [True, False]),
([1.0, 0], [1, 0.5], [True, False]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To reviewers: Just for reference, here are the new tests.

@KalyanGokhale : Nice refactoring!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @gfyoung

([1.0, 0.0], [1, 0], [True, True]),
(np.array([1, 2]), [1], [True, False]),
(Series([1, 2]), [1], [True, False]),
(Series([1, 2]), Series([1]), [True, False]),
(Series([1, 2]), set([1]), [True, False]),
(['a', 'b'], ['a'], [True, False]),
(Series(['a', 'b']), Series(['a']), [True, False]),
(Series(['a', 'b']), set(['a']), [True, False]),
(['a', 'b'], [1], [False, False])
])
def test_basic(self, comps, values, expected):
result = algos.isin(comps, values)
expected = np.array(expected)
tm.assert_numpy_array_equal(result, expected)

def test_i8(self):
Expand Down