From 158f3f2df1458a67ce232d0826b630e4bc826051 Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Sun, 15 Nov 2020 00:40:56 +0100 Subject: [PATCH] BUG: Prevent Series.isin from unwantedly casting isin values from float to integer (GH21804) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/algorithms.py | 19 ++++++++++++++++--- pandas/tests/test_algos.py | 18 ++++++++++++------ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e690334a36c5b..aab9e2a5bfd12 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -546,6 +546,7 @@ Numeric - Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`) - Bug in :meth:`DataFrame.std`` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) +- Bug in :meth:`Series.isin` where float ``values`` were unintentionally cast to ``integer`` before comparison (:issue:`21804`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ec88eb817b3f8..237920a0610e6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -58,6 +58,7 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices +from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: from pandas import Categorical, DataFrame, Series @@ -431,6 +432,15 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: return cast("Categorical", comps).isin(values) comps, dtype = _ensure_data(comps) + + if is_numeric_dtype(dtype): + # If comps is numeric, try downcasting values on their own to avoid the + # precision loss caused by casting values to the exact dtype of comps + try: + values = to_numeric(values, downcast="integer") + dtype = None + 
except (TypeError, ValueError): + pass values, _ = _ensure_data(values, dtype=dtype) # faster for larger cases to use np.in1d @@ -445,7 +455,11 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d - elif is_integer_dtype(comps): + elif ( + is_integer_dtype(comps) + or is_integer_dtype(values) + and not (is_float_dtype(comps) or is_float_dtype(values)) + ): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) @@ -453,8 +467,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: except (TypeError, ValueError, OverflowError): values = values.astype(object) comps = comps.astype(object) - - elif is_float_dtype(comps): + elif is_float_dtype(comps) or is_float_dtype(values): try: values = values.astype("float64", copy=False) comps = comps.astype("float64", copy=False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 34b7d0e73e914..5b73538ded6ff 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -942,13 +942,19 @@ def test_different_nans(self): ) tm.assert_numpy_array_equal(np.array([True]), result) - def test_no_cast(self): - # GH 22160 - # ensure 42 is not casted to a string - comps = ["ss", 42] - values = ["42"] - expected = np.array([False, False]) + @pytest.mark.parametrize( + "comps, values, expected_values", + [ + (["ss", 42], ["42"], [False, False]), + ([1, 0], [1, 0.5], [True, False]), + ], + ) + def test_no_cast(self, comps, values, expected_values): + # GH 22160 ensure 42 is not cast to a string + # GH 21804 ensure float isin values are not unintentionally cast + # to integer result = algos.isin(comps, values) + expected = np.array(expected_values) tm.assert_numpy_array_equal(expected, result) @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])