From 9728d81e9e8ec5cbe2e5030d565f1fe3db1206ae Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 16 May 2020 16:33:05 +0100 Subject: [PATCH 1/8] BUG: DataFrame with Int64 columns casts to float64 with .max()/.min() --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 4 +++- pandas/tests/frame/test_analytics.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 73892da2cbf71..7968c44346003 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -865,6 +865,7 @@ ExtensionArray - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Fixed bug where :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``Int64`` columns would cast the result to ``float64`` (:issue:`32651`) Other ^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 31015e3095e7d..d721b76aca7de 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8419,7 +8419,9 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - if numeric_only is not None and axis in [0, 1]: + if ( + self._is_homogeneous_type and self._mgr.any_extension_types and axis == 0 + ) or (numeric_only is not None and axis in [0, 1]): df = self if numeric_only is True: df = _get_data(axis_matters=True) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b4842e8d5e8ed..65e453beb0cec 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -907,6 +907,19 @@ def test_mean_extensionarray_numeric_only_true(self): expected = pd.DataFrame(arr).mean() tm.assert_series_equal(result, expected) + 
@pytest.mark.parametrize("numeric_only", [True, False, None]) + @pytest.mark.parametrize("method", ["min", "max"]) + def test_minmax_extensionarray(self, method, numeric_only): + # https://github.com/pandas-dev/pandas/issues/32651 + int64_info = np.iinfo("int64") + ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) + df = DataFrame({"Int64": ser}) + result = getattr(df, method)(numeric_only=numeric_only) + expected = Series( + [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + ) + tm.assert_series_equal(result, expected) + def test_stats_mixed_type(self, float_string_frame): # don't blow up float_string_frame.std(1) From 7765bac79a191e66348cd13812e454bd964c5b2f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 17 May 2020 12:25:19 +0100 Subject: [PATCH 2/8] temp fix --- pandas/core/arrays/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 2720c831bcff6..272e01f465b05 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1220,7 +1220,7 @@ def any(self, axis=0, *args, **kwargs): return values.any().item() - def sum(self, axis=0, *args, **kwargs): + def sum(self, axis=0, min_count=0, *args, **kwargs): """ Sum of non-NA/null values From 4a14032bc1dfe50acc2064312eea94a4ccb46518 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 May 2020 14:15:42 +0100 Subject: [PATCH 3/8] changes condition --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d721b76aca7de..4fa483b566c94 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8420,8 +8420,8 @@ def _get_data(axis_matters): return data if ( - self._is_homogeneous_type and self._mgr.any_extension_types and axis == 0 - ) or (numeric_only is not None and axis in [0, 1]): + self._mgr.any_extension_types or numeric_only 
is not None + ) and axis is not None: df = self if numeric_only is True: df = _get_data(axis_matters=True) From e4e6c8ed4ee77f2d21496637ebf326c006e72633 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 May 2020 14:51:35 +0100 Subject: [PATCH 4/8] make results consistent for numeric_only parameter and numeric_only dataframes --- pandas/core/frame.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4fa483b566c94..5f48f3eca9de0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8419,9 +8419,11 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - if ( - self._mgr.any_extension_types or numeric_only is not None - ) and axis is not None: + # TODO: dispatch to EA reductions for all EA not just numeric + # https://github.com/pandas-dev/pandas/pull/34210/files + is_numeric = all(b.is_numeric for b in self._mgr.blocks) + + if (is_numeric or numeric_only is not None) and axis is not None: df = self if numeric_only is True: df = _get_data(axis_matters=True) @@ -8443,6 +8445,11 @@ def blk_func(values): assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() + elif not out_dtype: + # DeprecationWarning: The default dtype for empty Series will be + # 'object' instead of 'float64' in a future version. Specify + # a dtype explicitly to silence this warning. 
+ out_dtype = 'float64' out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns if axis == 0 and is_object_dtype(out.dtype): From 9b5f41c3bb78c466917ff8447c1f453b1b2559ea Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 May 2020 14:52:56 +0100 Subject: [PATCH 5/8] black fixup --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f48f3eca9de0..a293923f3c26d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8449,7 +8449,7 @@ def blk_func(values): # DeprecationWarning: The default dtype for empty Series will be # 'object' instead of 'float64' in a future version. Specify # a dtype explicitly to silence this warning. - out_dtype = 'float64' + out_dtype = "float64" out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns if axis == 0 and is_object_dtype(out.dtype): From e704f514ac4463eb19eeeee809c11082b885b6c9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 May 2020 16:55:08 +0100 Subject: [PATCH 6/8] comments --- pandas/core/frame.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a293923f3c26d..e4f0c62e9ea15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8419,8 +8419,6 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - # TODO: dispatch to EA reductions for all EA not just numeric - # https://github.com/pandas-dev/pandas/pull/34210/files is_numeric = all(b.is_numeric for b in self._mgr.blocks) if (is_numeric or numeric_only is not None) and axis is not None: @@ -8446,9 +8444,8 @@ def blk_func(values): if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() elif not out_dtype: - # DeprecationWarning: The default dtype for empty Series will be - # 'object' instead of 'float64' in a future version. 
Specify - # a dtype explicitly to silence this warning. + # The default dtype for empty Series will be 'object' instead of + # 'float64' in a future version. out_dtype = "float64" out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns From 7b18d89fdcebf362e32e364c104518f028ff3243 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 30 Jun 2020 13:02:30 +0100 Subject: [PATCH 7/8] change is_numeric to be False on empty DataFrame --- pandas/core/frame.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01a94af3c4546..c0a991156b095 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8508,7 +8508,9 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - is_numeric = all(b.is_numeric for b in self._mgr.blocks) + is_numeric = all(b.is_numeric for b in self._mgr.blocks) and len( + self._mgr.blocks + ) if (is_numeric or numeric_only is not None) and axis is not None: df = self @@ -8532,10 +8534,6 @@ def blk_func(values): assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() - elif not out_dtype: - # The default dtype for empty Series will be 'object' instead of - # 'float64' in a future version. 
- out_dtype = "float64" out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns if axis == 0 and is_object_dtype(out.dtype): From e850108165ea4df44d52a814eb712a44b426e99b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 30 Jun 2020 13:12:08 +0100 Subject: [PATCH 8/8] reorder condition --- pandas/core/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c0a991156b095..9e14773738faf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8508,9 +8508,7 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - is_numeric = all(b.is_numeric for b in self._mgr.blocks) and len( - self._mgr.blocks - ) + is_numeric = self._mgr.blocks and all(b.is_numeric for b in self._mgr.blocks) if (is_numeric or numeric_only is not None) and axis is not None: df = self