From 05406dcedd4d319c43293f4c50a9e36ca2f757f5 Mon Sep 17 00:00:00 2001 From: OlehKSS Date: Fri, 2 Oct 2020 17:42:20 +0200 Subject: [PATCH 1/9] Fix select_dtypes(include='int') for Windows. --- pandas/core/dtypes/common.py | 5 ++++ pandas/tests/dtypes/test_common.py | 24 ++++++++++--------- .../tests/frame/methods/test_select_dtypes.py | 6 +++++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 14184f044ae95..8457157671754 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1796,6 +1796,11 @@ def pandas_dtype(dtype) -> DtypeObj: # try a numpy dtype # raise a consistent TypeError if failed try: + # int is mapped to different types (int32, in64) on Windows and Linux + # see https://github.com/numpy/numpy/issues/9464 + if (isinstance(dtype, str) and dtype == "int") or (dtype is int): + dtype = "int64" + npdtype = np.dtype(dtype) except SyntaxError as err: # np.dtype uses `eval` which can raise SyntaxError diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 2db9a9a403e1c..c43d5565402e7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -43,20 +43,22 @@ def test_invalid_dtype_error(self, box): com.pandas_dtype(box) @pytest.mark.parametrize( - "dtype", + "dtype_input,dtype_output", [ - object, - "float64", - np.object_, - np.dtype("object"), - "O", - np.float64, - float, - np.dtype("float64"), + (object, object), + ("float64", np.float64), + (np.object_, np.object_), + (np.dtype("object"), np.object_), + ("O", object), + (np.float64, np.float64), + (float, float), + (np.dtype("float64"), np.float64), + ("int", np.int64), + (int, np.int64) ], ) - def test_pandas_dtype_valid(self, dtype): - assert com.pandas_dtype(dtype) == dtype + def test_pandas_dtype_valid(self, dtype_input, dtype_output): + assert com.pandas_dtype(dtype_input) == dtype_output @pytest.mark.parametrize( "dtype", 
["M8[ns]", "m8[ns]", "object", "float64", "int64"] diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 4599761909c33..9b046daa1a975 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -82,6 +82,12 @@ def test_select_dtypes_exclude_include_using_list_like(self): e = df[["b", "c", "e"]] tm.assert_frame_equal(r, e) + exclude = (np.datetime64,) + include = np.bool_, "int" + r = df.select_dtypes(include=include, exclude=exclude) + e = df[["b", "e"]] + tm.assert_frame_equal(r, e) + exclude = ("datetime",) include = "bool", "int64", "int32" r = df.select_dtypes(include=include, exclude=exclude) From 4b75067f4f5b01744a9565a21d5ac5057c6a7c43 Mon Sep 17 00:00:00 2001 From: OlehKSS Date: Fri, 2 Oct 2020 17:50:46 +0200 Subject: [PATCH 2/9] Whatsnew entry was added. --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 016e8d90e7d21..86bd63110a336 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -326,6 +326,7 @@ Numeric - Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) - Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) - Bug in :meth:`pd._testing.assert_almost_equal` was incorrect for complex numeric types (:issue:`28235`) +- Bug in :func:`select_dtypes` different behaviour on windows and linux for ``select_dtypes(include="int")`` (:issue:`36569`) - Conversion From bd6e724dab4bf5562cffc878f92494ea46ea52cf Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Sun, 8 Nov 2020 21:07:03 +0100 Subject: [PATCH 3/9] Fix code style --- pandas/tests/dtypes/test_common.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c43d5565402e7..fec92cd8e43ff 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -54,7 +54,7 @@ def test_invalid_dtype_error(self, box): (float, float), (np.dtype("float64"), np.float64), ("int", np.int64), - (int, np.int64) + (int, np.int64), ], ) def test_pandas_dtype_valid(self, dtype_input, dtype_output): From 39ef89b6abf3317a7f00270a95d1e850524198ff Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Tue, 10 Nov 2020 23:12:07 +0100 Subject: [PATCH 4/9] Fix int inference in select_dtypes. --- pandas/core/dtypes/common.py | 5 ----- pandas/core/frame.py | 17 +++++++++++++++-- pandas/tests/dtypes/test_common.py | 24 +++++++++++------------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 8457157671754..14184f044ae95 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1796,11 +1796,6 @@ def pandas_dtype(dtype) -> DtypeObj: # try a numpy dtype # raise a consistent TypeError if failed try: - # int is mapped to different types (int32, in64) on Windows and Linux - # see https://github.com/numpy/numpy/issues/9464 - if (isinstance(dtype, str) and dtype == "int") or (dtype is int): - dtype = "int64" - npdtype = np.dtype(dtype) except SyntaxError as err: # np.dtype uses `eval` which can raise SyntaxError diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cf9c39af5b8fa..427d09e89b430 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3633,8 +3633,21 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: raise ValueError("at least one of include or exclude must be nonempty") # convert the myriad valid dtypes object to a single representation - include = frozenset(infer_dtype_from_object(x) for x in include) - exclude = frozenset(infer_dtype_from_object(x) for x in 
exclude) + def check_int_infer_dtype(dtypes): + converted_dtypes = [] + for dtype in dtypes: + # Numpy maps int to different types (int32, int64) on Windows and Linux + # see https://github.com/numpy/numpy/issues/9464 + if (isinstance(dtype, str) and dtype == "int") or (dtype is int): + converted_dtypes.append(np.int32) + converted_dtypes.append(np.int64) + else: + converted_dtypes.append(infer_dtype_from_object(dtype)) + return frozenset(converted_dtypes) + + include = check_int_infer_dtype(include) + exclude = check_int_infer_dtype(exclude) + for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index fec92cd8e43ff..2db9a9a403e1c 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -43,22 +43,20 @@ def test_invalid_dtype_error(self, box): com.pandas_dtype(box) @pytest.mark.parametrize( - "dtype_input,dtype_output", + "dtype", [ - (object, object), - ("float64", np.float64), - (np.object_, np.object_), - (np.dtype("object"), np.object_), - ("O", object), - (np.float64, np.float64), - (float, float), - (np.dtype("float64"), np.float64), - ("int", np.int64), - (int, np.int64), + object, + "float64", + np.object_, + np.dtype("object"), + "O", + np.float64, + float, + np.dtype("float64"), ], ) - def test_pandas_dtype_valid(self, dtype_input, dtype_output): - assert com.pandas_dtype(dtype_input) == dtype_output + def test_pandas_dtype_valid(self, dtype): + assert com.pandas_dtype(dtype) == dtype @pytest.mark.parametrize( "dtype", ["M8[ns]", "m8[ns]", "object", "float64", "int64"] From 810bbab51536e056df34ec2310f8814729fb8e4b Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Thu, 12 Nov 2020 20:42:30 +0100 Subject: [PATCH 5/9] Add a separate unit test.
--- .../tests/frame/methods/test_select_dtypes.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 9b046daa1a975..401dbe7c6fad6 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -82,16 +82,33 @@ def test_select_dtypes_exclude_include_using_list_like(self): e = df[["b", "c", "e"]] tm.assert_frame_equal(r, e) + exclude = ("datetime",) + include = "bool", "int64", "int32" + r = df.select_dtypes(include=include, exclude=exclude) + e = df[["b", "e"]] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_exclude_include_int(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("int32"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) exclude = (np.datetime64,) include = np.bool_, "int" r = df.select_dtypes(include=include, exclude=exclude) - e = df[["b", "e"]] + e = df[["b", "c", "e"]] tm.assert_frame_equal(r, e) exclude = ("datetime",) - include = "bool", "int64", "int32" + include = "bool", int r = df.select_dtypes(include=include, exclude=exclude) - e = df[["b", "e"]] + e = df[["b", "c", "e"]] tm.assert_frame_equal(r, e) def test_select_dtypes_include_using_scalars(self): From f45f2715f53689ce16e25b5d9cdcf66bf69b5276 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Mon, 30 Nov 2020 17:11:54 +0100 Subject: [PATCH 6/9] Fix dtype in the unit test. 
--- pandas/tests/frame/methods/test_select_dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 401dbe7c6fad6..da25be0ac7569 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -70,7 +70,7 @@ def test_select_dtypes_exclude_include_using_list_like(self): { "a": list("abc"), "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), + "c": np.arange(3, 6, dtype="u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, @@ -93,7 +93,7 @@ def test_select_dtypes_exclude_include_int(self): { "a": list("abc"), "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("int32"), + "c": np.arange(3, 6, dtype="int32"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values, From a913bd8b07765a725c49013f9a1413fa39c6bf6c Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Mon, 18 Jan 2021 12:21:52 +0100 Subject: [PATCH 7/9] Fix whatsnew, add new unit test. 
--- doc/source/whatsnew/v1.3.0.rst | 1 + .../tests/frame/methods/test_select_dtypes.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6a85bfd852e19..6ff98c0867f5d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -251,6 +251,7 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`) +- Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) - Conversion diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index f65674e880a12..2ab58d2b484b0 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -129,6 +129,7 @@ def test_select_dtypes_exclude_include_using_list_like(self): tm.assert_frame_equal(r, e) def test_select_dtypes_exclude_include_int(self): + # Fix select_dtypes(include='int') for Windows, FYI #36569 df = DataFrame( { "a": list("abc"), @@ -141,15 +142,21 @@ def test_select_dtypes_exclude_include_int(self): ) exclude = (np.datetime64,) include = np.bool_, "int" - r = df.select_dtypes(include=include, exclude=exclude) - e = df[["b", "c", "e"]] - tm.assert_frame_equal(r, e) + result = df.select_dtypes(include=include, exclude=exclude) + expected = df[["b", "c", "e"]] + tm.assert_frame_equal(result, expected) + + exclude = (np.datetime64,) + include = np.bool_, "integer" + result = df.select_dtypes(include=include, exclude=exclude) + expected = df[["b", "c", "e"]] + tm.assert_frame_equal(result, expected) exclude =
("datetime",) include = "bool", int - r = df.select_dtypes(include=include, exclude=exclude) - e = df[["b", "c", "e"]] - tm.assert_frame_equal(r, e) + result = df.select_dtypes(include=include, exclude=exclude) + expected = df[["b", "c", "e"]] + tm.assert_frame_equal(result, expected) def test_select_dtypes_include_using_scalars(self): df = DataFrame( From 502093b984848a58bcd92e71330ff673afb56e39 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Fri, 5 Feb 2021 17:11:47 +0100 Subject: [PATCH 8/9] Parametrize tests. --- .../tests/frame/methods/test_select_dtypes.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 2ab58d2b484b0..2a94b18b806f8 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -128,7 +128,10 @@ def test_select_dtypes_exclude_include_using_list_like(self): e = df[["b", "e"]] tm.assert_frame_equal(r, e) - def test_select_dtypes_exclude_include_int(self): + @pytest.mark.parametrize( + "include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)] + ) + def test_select_dtypes_exclude_include_int(self, include): # Fix select_dtypes(include='int') for Windows, FYI #36569 df = DataFrame( { @@ -141,19 +144,6 @@ def test_select_dtypes_exclude_include_int(self): } ) exclude = (np.datetime64,) - include = np.bool_, "int" - result = df.select_dtypes(include=include, exclude=exclude) - expected = df[["b", "c", "e"]] - tm.assert_frame_equal(result, expected) - - exclude = (np.datetime64,) - include = np.bool_, "integer" - result = df.select_dtypes(include=include, exclude=exclude) - expected = df[["b", "c", "e"]] - tm.assert_frame_equal(result, expected) - - exclude = ("datetime",) - include = "bool", int result = df.select_dtypes(include=include, exclude=exclude) expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) From
06b7077eb56422c25c123f563a54c32a1b10c3fc Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Sat, 6 Feb 2021 00:10:49 +0100 Subject: [PATCH 9/9] Trigger CI.