From 2a3f8145769d055886a48bc6b9c6dc4e7ed257e3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 15:54:35 -0500 Subject: [PATCH 01/16] Add test --- pandas/tests/groupby/test_function.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 9c33843cdcecc..128b867f0689c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1636,3 +1636,20 @@ def test_apply_to_nullable_integer_returns_float(values, function): result = groups.agg([function]) expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = pd.Index([0, 1, 2], dtype=object, name="a") + + result = grouped["b"].sum(min_count=2) + expected = pd.Series([np.nan] * 3, index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = pd.DataFrame( + {"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx + ) + tm.assert_frame_equal(result, expected) From 3ce038427a430f95ad615202e361c0cc91aeaa14 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 15:54:58 -0500 Subject: [PATCH 02/16] Check for null --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 577c874c9cbbe..b4f833710db80 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -553,7 +553,8 @@ def _cython_operation( # Two options for avoiding this special case # 1. mask-aware ops and avoid casting to float with NaN above # 2. specify the result dtype when calling this method - result = result.astype("int64") + if not isna(result).any(): + result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 From e25c823ae8e0a41bb3af65eb7d3faca580731840 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 15:55:10 -0500 Subject: [PATCH 03/16] Release note --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4044fb2d3fa09..2305b6032e257 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -364,6 +364,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`) Reshaping ^^^^^^^^^ From 2656bc33ff89658078019595ee9662906e6ccc49 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 16:37:42 -0500 Subject: [PATCH 04/16] Update and comment --- pandas/core/groupby/ops.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b4f833710db80..6e9f61e4980e1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -547,14 +547,18 @@ def _cython_operation( how == "add" and is_integer_dtype(orig_values.dtype) and is_extension_array_dtype(orig_values.dtype) + and not isna(result).any() ): # We need this to ensure that Series[Int64Dtype].resample().sum() # remains int64 dtype. # Two options for avoiding this special case # 1. mask-aware ops and avoid casting to float with NaN above # 2. specify the result dtype when calling this method - if not isna(result).any(): - result = result.astype("int64") + # + # Sometimes result can contain null values (e.g. see + # https://github.com/pandas-dev/pandas/issues/32861) + # and so we must check for that before casting to int + result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 From 300b1e5ee52d6c86e0505fac59b4da41e2d3e62e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 19:49:21 -0500 Subject: [PATCH 05/16] Update test --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 128b867f0689c..77c1b6c9ad2fa 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1645,7 +1645,7 @@ def test_groupby_sum_below_mincount_nullable_integer(): idx = pd.Index([0, 1, 2], dtype=object, name="a") result = grouped["b"].sum(min_count=2) - expected = pd.Series([np.nan] * 3, index=idx, name="b") + expected = pd.Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") tm.assert_series_equal(result, expected) result = grouped.sum(min_count=2) From e2e08e249eb4a8ca036547d5349e28af5aad268d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 19:51:09 -0500 Subject: [PATCH 06/16] Hack --- pandas/core/groupby/ops.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6e9f61e4980e1..5a839208d9666 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -38,6 +38,7 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna +import pandas as pd import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -582,6 +583,14 @@ def _cython_operation( elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) + if ( + how == "add" + and is_integer_dtype(orig_values.dtype) + and is_extension_array_dtype(orig_values.dtype) + and isna(result).any() + ): + result = pd.array(result.ravel(), dtype="Int64") + return result, names def aggregate( From cdf7dfb95b9056972ff0629d57f2e6ac129e89c6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 26 Mar 2020 20:01:20 -0500 Subject: [PATCH 07/16] Try a different casting --- pandas/core/groupby/ops.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fc6e94224a74b..7ce2e7912b157 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -18,6 +18,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, @@ -542,22 +543,6 @@ def _cython_operation( if mask.any(): result = result.astype("float64") result[mask] = np.nan - elif ( - how == "add" - and is_integer_dtype(orig_values.dtype) - and is_extension_array_dtype(orig_values.dtype) - and not isna(result).any() - ): - # We need this to ensure that Series[Int64Dtype].resample().sum() - # remains int64 dtype. - # Two options for avoiding this special case - # 1. mask-aware ops and avoid casting to float with NaN above - # 2. specify the result dtype when calling this method - # - # Sometimes result can contain null values (e.g. see - # https://github.com/pandas-dev/pandas/issues/32861) - # and so we must check for that before casting to int - result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 @@ -582,12 +567,11 @@ def _cython_operation( result = result.astype(orig_values.dtype) if ( - how == "add" - and is_integer_dtype(orig_values.dtype) + is_integer_dtype(orig_values.dtype) and is_extension_array_dtype(orig_values.dtype) - and isna(result).any() + and not is_extension_array_dtype(result.dtype) ): - result = pd.array(result.ravel(), dtype="Int64") + result = maybe_cast_to_extension_array(type(orig_values), result) return result, names From b4edea38d19de166f94dc3b0c0afdece342d48fb Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 26 Mar 2020 20:03:57 -0500 Subject: [PATCH 08/16] No pd --- pandas/core/groupby/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7ce2e7912b157..e28533217e4fb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -39,7 +39,6 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna -import pandas as pd import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com From 02fc55c0869a969abfc4da64e800373676a6b4de Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 26 Mar 2020 20:32:31 -0500 Subject: [PATCH 09/16] Only for add --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e28533217e4fb..2d9f8554e6c12 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -566,7 +566,8 @@ def _cython_operation( result = result.astype(orig_values.dtype) if ( - is_integer_dtype(orig_values.dtype) + how == "add" + and is_integer_dtype(orig_values.dtype) and is_extension_array_dtype(orig_values.dtype) and not is_extension_array_dtype(result.dtype) ): From fe0ab930bb61b460cd001117ae6f6043a156956f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 11:00:25 -0500 Subject: [PATCH 10/16] Undo --- doc/source/whatsnew/v1.1.0.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 92af32757aa1a..7920820b32620 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -314,7 +314,7 @@ Conversion Strings ^^^^^^^ -- Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). +- - @@ -400,6 +400,8 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) + Reshaping ^^^^^^^^^ @@ -420,6 +422,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`) - Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) +- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) Sparse From e6f5b4df6353b08dd777acaafc8007094b41eaa5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 22 Mar 2020 15:55:10 -0500 Subject: [PATCH 11/16] Release note --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7920820b32620..7b4700f800e40 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -401,7 +401,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) - +- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`) Reshaping ^^^^^^^^^ From c29dfad03f3fe12e44f6c1cdeb1e0fabf5d70e96 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 11:05:46 -0500 Subject: [PATCH 12/16] Fix --- doc/source/whatsnew/v1.1.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7b4700f800e40..1bc2d2823d32e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -314,9 +314,8 @@ Conversion Strings ^^^^^^^ +- Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). - -- - Interval ^^^^^^^^ From fb6b1d58754ae7c8c7a34a141180d1df0b3c0b80 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 11:07:32 -0500 Subject: [PATCH 13/16] Space --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1bc2d2823d32e..8846325d6db8d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -317,6 +317,7 @@ Strings - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). - + Interval ^^^^^^^^ - From 7c815b588b89b07c68c6937a7e39d3be2156a4b8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 31 Mar 2020 19:54:00 -0500 Subject: [PATCH 14/16] maybe_cast_result --- pandas/core/dtypes/cast.py | 14 ++++++++------ pandas/core/groupby/ops.py | 11 ++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index da9646aa8c46f..c0345ff2a1aa4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -32,6 +32,7 @@ ensure_str, is_bool, is_bool_dtype, + is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, @@ -275,12 +276,13 @@ def maybe_cast_result( dtype = maybe_cast_result_dtype(dtype, how) if not is_scalar(result): - if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The result may be of any type, cast back to original - # type if it's compatible. - if len(result) and isinstance(result[0], dtype.type): - cls = dtype.construct_array_type() - result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + if ( + is_extension_array_dtype(dtype) + and dtype.kind != "M" + and not is_categorical_dtype(dtype) + ): + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2d9f8554e6c12..1db197ee195e4 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -18,7 +18,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.cast import maybe_cast_to_extension_array +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, @@ -565,13 +565,10 @@ def _cython_operation( elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) - if ( - how == "add" - and is_integer_dtype(orig_values.dtype) - and is_extension_array_dtype(orig_values.dtype) - and not is_extension_array_dtype(result.dtype) + if is_integer_dtype(orig_values.dtype) and is_extension_array_dtype( + orig_values.dtype ): - result = maybe_cast_to_extension_array(type(orig_values), result) + result = maybe_cast_result(result=result, obj=orig_values, how=how) return result, names From aeec4b04c705ae70f84ff1142e5a0cd87a04e478 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 31 Mar 2020 20:44:06 -0500 Subject: [PATCH 15/16] float -> Int --- pandas/tests/resample/test_datetime_index.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3ad82b9e075a8..ab6985b11ba9a 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -122,9 +122,7 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], - index=pd.date_range("1/1/2000", periods=3, freq="3T"), - dtype="float64", + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64", ) tm.assert_series_equal(result, expected) From fc0f40600c86166685815724c3388b58fe2fd477 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 3 Apr 2020 21:26:28 -0500 Subject: [PATCH 16/16] Less if --- pandas/core/groupby/ops.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1db197ee195e4..bfa205da0b282 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -565,9 +565,7 @@ def _cython_operation( elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) - if is_integer_dtype(orig_values.dtype) and is_extension_array_dtype( - orig_values.dtype - ): + if is_extension_array_dtype(orig_values.dtype): result = maybe_cast_result(result=result, obj=orig_values, how=how) return result, names