From a1c48bcb86f4b1e98c87d9c51f461cede70c3cee Mon Sep 17 00:00:00 2001
From: richard
Date: Tue, 27 Sep 2022 16:19:42 -0400
Subject: [PATCH 1/4] REGR: groupby fails with nullable dtypes and dropna=False

---
 doc/source/whatsnew/v1.5.1.rst              |   1 +
 pandas/core/arrays/arrow/array.py           |   4 +-
 pandas/core/arrays/masked.py                |   2 +-
 pandas/tests/groupby/test_groupby_dropna.py | 113 +++++++++++---------
 4 files changed, 66 insertions(+), 54 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index ee66f2f649bc2..77622b25a9404 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -78,6 +78,7 @@ Fixed regressions
 - Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
 - Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
 - Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
+- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 03f734730a0f9..d0c64ec522eef 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -619,8 +619,8 @@ def factorize(
                 na_mask = indices.values == -1
                 na_index = na_mask.argmax()
                 if na_mask[na_index]:
-                    uniques = uniques.insert(na_index, self.dtype.na_value)
-                    na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
+                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
+                    uniques = uniques.insert(na_code, self.dtype.na_value)
                     indices[indices >= na_code] += 1
                     indices[indices == -1] = na_code
         else:
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 043e0baf3ec0e..34ca205f7709a 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -913,7 +913,7 @@ def factorize(
             else:
                 # mypy error: Slice index must be an integer or None
                 # https://github.com/python/mypy/issues/2410
-                na_code = codes[:na_index].argmax() + 1  # type: ignore[misc]
+                na_code = codes[:na_index].max() + 1  # type: ignore[misc]
             codes[codes >= na_code] += 1
             codes[codes == -1] = na_code
             # dummy value for uniques; not used since uniques_mask will be True
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 360e3096ceb63..d68ea4b5ab733 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -393,75 +393,86 @@ def test_groupby_drop_nan_with_multi_index():
     tm.assert_frame_equal(result, expected)
 
 
+# Test all combinations of values e.g. 1, 2, and NA. Use string labels to
+# correspond to various dtypes. "z" always corresponds to NA.
+@pytest.mark.parametrize("sequence0", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence1", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence2", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence3", ["x", "y", "z"])
 @pytest.mark.parametrize(
-    "values, dtype",
+    "uniques, dtype",
     [
-        ([2, np.nan, 1, 2], None),
-        ([2, np.nan, 1, 2], "UInt8"),
-        ([2, np.nan, 1, 2], "Int8"),
-        ([2, np.nan, 1, 2], "UInt16"),
-        ([2, np.nan, 1, 2], "Int16"),
-        ([2, np.nan, 1, 2], "UInt32"),
-        ([2, np.nan, 1, 2], "Int32"),
-        ([2, np.nan, 1, 2], "UInt64"),
-        ([2, np.nan, 1, 2], "Int64"),
-        ([2, np.nan, 1, 2], "Float32"),
-        ([2, np.nan, 1, 2], "Int64"),
-        ([2, np.nan, 1, 2], "Float64"),
+        ({"x": 1, "y": 2, "z": np.nan}, None),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt8"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int8"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt16"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int16"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Float32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Float64"),
+        ({"x": "x", "y": "y", "z": None}, "category"),
+        ({"x": "x", "y": "y", "z": pd.NA}, "string"),
         pytest.param(
-            ["y", None, "x", "y"],
-            "category",
-            marks=pytest.mark.xfail(
-                reason="dropna=False not correct for categorical, GH#48645"
-            ),
-        ),
-        (["y", pd.NA, "x", "y"], "string"),
-        pytest.param(
-            ["y", pd.NA, "x", "y"],
+            {"x": "x", "y": "y", "z": pd.NA},
             "string[pyarrow]",
             marks=pytest.mark.skipif(
                 pa_version_under1p01, reason="pyarrow is not installed"
            ),
         ),
         (
-            ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
+            {"x": "2016-01-01", "y": "2017-01-01", "z": np.datetime64("NaT")},
             "datetime64[ns]",
         ),
         (
-            [
-                pd.Period("2012-02-01", freq="D"),
-                pd.NaT,
-                pd.Period("2012-01-01", freq="D"),
-                pd.Period("2012-02-01", freq="D"),
-            ],
+            {
+                "x": pd.Period("2012-01-01", freq="D"),
+                "y": pd.Period("2012-02-01", freq="D"),
+                "z": pd.NaT,
+            },
             None,
         ),
-        (pd.arrays.SparseArray([2, np.nan, 1, 2]), None),
     ],
 )
-@pytest.mark.parametrize("test_series", [True, False])
-def test_no_sort_keep_na(values, dtype, test_series):
-    # GH#46584
-    key = pd.Series(values, dtype=dtype)
-    df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})
+def test_no_sort_keep_na(
+    request, sequence0, sequence1, sequence2, sequence3, uniques, dtype
+):
+    # GH#46584, GH#48794
+    sequence = "".join([sequence0, sequence1, sequence2, sequence3])
+    if dtype == "category" and "z" in sequence:
+        # Only xfail when nulls are present
+        msg = "dropna=False not correct for categorical, GH#48645"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    if dtype == "datetime64[ns]" and sequence == "zzzz":
+        msg = "Cannot construct datetime of all nulls"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    weights = {"x": 1, "y": 2, "z": 3}
+
+    key = pd.Series([uniques[label] for label in sequence], dtype=dtype)
+    df = pd.DataFrame({"key": key, "a": [weights[label] for label in sequence]})
     gb = df.groupby("key", dropna=False, sort=False)
-    if test_series:
-        gb = gb["a"]
-
-    warn = None
-    if isinstance(values, pd.arrays.SparseArray):
-        warn = FutureWarning
-    msg = "passing a SparseArray to pd.Index will store that array directly"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = gb.sum()
-    expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
-    if test_series:
-        expected = expected["a"]
-    if expected.index.is_categorical():
-        # TODO: Slicing reorders categories?
-        expected.index = expected.index.reorder_categories(["y", "x"])
-    tm.assert_equal(result, expected)
+    result = gb.sum()
+    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
+    # issues with hashing np.nan
+    summed = {}
+    for label in sequence:
+        summed[label] = summed.get(label, 0) + weights[label]
+
+    if dtype == "category":
+        index = pd.CategoricalIndex(
+            [uniques[label] for label in summed],
+            # Get the nonnull categories in the order they appear ignoring duplicates
+            list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
+            name="key",
+        )
+    else:
+        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
+    expected = pd.Series(summed.values(), index=index, name="a", dtype=None).to_frame()
+    tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize("test_series", [True, False])

From 8a107bee5ff59accde019193803220f78369c6ef Mon Sep 17 00:00:00 2001
From: richard
Date: Tue, 27 Sep 2022 16:19:42 -0400
Subject: [PATCH 2/4] Rework test

---
 doc/source/whatsnew/v1.5.1.rst              |   3 -
 pandas/tests/groupby/test_groupby_dropna.py | 110 +++++++++++---------
 2 files changed, 58 insertions(+), 55 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index 77622b25a9404..a733eb63be7d4 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -75,9 +75,6 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.describe` raising ``TypeError`` when result contains ``NA`` (:issue:`48778`)
 - Fixed regression in :meth:`DataFrame.plot` ignoring invalid ``colormap`` for ``kind="scatter"`` (:issue:`48726`)
 - Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
-- Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
-- Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
-- Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
 - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index d68ea4b5ab733..18abb3e30ef45 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -393,86 +393,92 @@ def test_groupby_drop_nan_with_multi_index():
     tm.assert_frame_equal(result, expected)
 
 
-# Test all combinations of values e.g. 1, 2, and NA. Use string labels to
-# correspond to various dtypes. "z" always corresponds to NA.
-@pytest.mark.parametrize("sequence0", ["x", "y", "z"])
-@pytest.mark.parametrize("sequence1", ["x", "y", "z"])
-@pytest.mark.parametrize("sequence2", ["x", "y", "z"])
-@pytest.mark.parametrize("sequence3", ["x", "y", "z"])
+# sequence_index enumerates all strings made up of x, y, z of length 4
+@pytest.mark.parametrize("sequence_index", range(3**4 + 1))
 @pytest.mark.parametrize(
-    "uniques, dtype",
+    "dtype",
     [
-        ({"x": 1, "y": 2, "z": np.nan}, None),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt8"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int8"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt16"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int16"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt32"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int32"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt64"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Float32"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Float64"),
-        ({"x": "x", "y": "y", "z": None}, "category"),
-        ({"x": "x", "y": "y", "z": pd.NA}, "string"),
+        None,
+        "UInt8",
+        "Int8",
+        "UInt16",
+        "Int16",
+        "UInt32",
+        "Int32",
+        "UInt64",
+        "Int64",
+        "Float32",
+        "Int64",
+        "Float64",
+        "category",
+        "string",
         pytest.param(
-            {"x": "x", "y": "y", "z": pd.NA},
             "string[pyarrow]",
             marks=pytest.mark.skipif(
                 pa_version_under1p01, reason="pyarrow is not installed"
             ),
         ),
-        (
-            {"x": "2016-01-01", "y": "2017-01-01", "z": np.datetime64("NaT")},
-            "datetime64[ns]",
-        ),
-        (
-            {
-                "x": pd.Period("2012-01-01", freq="D"),
-                "y": pd.Period("2012-02-01", freq="D"),
-                "z": pd.NaT,
-            },
-            None,
-        ),
+        "datetime64[ns]",
+        "period[d]",
+        "Sparse[float]",
     ],
 )
-def test_no_sort_keep_na(
-    request, sequence0, sequence1, sequence2, sequence3, uniques, dtype
-):
+@pytest.mark.parametrize("test_series", [True, False])
+def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
     # GH#46584, GH#48794
-    sequence = "".join([sequence0, sequence1, sequence2, sequence3])
+
+    # Convert sequence_index into a string sequence, e.g. 5 becomes "xxyz"
+    # This sequence is used for the grouper.
+    sequence = "".join(
+        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
+    )
+
     if dtype == "category" and "z" in sequence:
         # Only xfail when nulls are present
         msg = "dropna=False not correct for categorical, GH#48645"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
-    if dtype == "datetime64[ns]" and sequence == "zzzz":
-        msg = "Cannot construct datetime of all nulls"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-    weights = {"x": 1, "y": 2, "z": 3}
-
-    key = pd.Series([uniques[label] for label in sequence], dtype=dtype)
-    df = pd.DataFrame({"key": key, "a": [weights[label] for label in sequence]})
+    # Unique values to use for grouper, depends on dtype
+    if dtype in ("string", "string[pyarrow]"):
+        uniques = {"x": "x", "y": "y", "z": pd.NA}
+    elif dtype in ("datetime64[ns]", "period[d]"):
+        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
+    else:
+        uniques = {"x": 1, "y": 2, "z": np.nan}
+    df = pd.DataFrame(
+        {
+            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
+            "a": [0, 1, 2, 3],
+        }
+    )
+    gb = df.groupby("key", dropna=False, sort=False)
+    if test_series:
+        gb = gb["a"]
     result = gb.sum()
+
     # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
     # issues with hashing np.nan
     summed = {}
-    for label in sequence:
-        summed[label] = summed.get(label, 0) + weights[label]
-
+    for idx, label in enumerate(sequence):
+        summed[label] = summed.get(label, 0) + idx
     if dtype == "category":
         index = pd.CategoricalIndex(
-            [uniques[label] for label in summed],
-            # Get the nonnull categories in the order they appear ignoring duplicates
+            [uniques[e] for e in summed],
             list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
             name="key",
         )
+    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
+        index = pd.Index(
+            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
+        )
     else:
         index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
-    expected = pd.Series(summed.values(), index=index, name="a", dtype=None).to_frame()
-    tm.assert_frame_equal(result, expected)
+    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
+    if not test_series:
+        expected = expected.to_frame()
+
+    tm.assert_equal(result, expected)
 
 
 @pytest.mark.parametrize("test_series", [True, False])

From 0510d95d90662b4c92f502b7de2bd873c0e89220 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Wed, 28 Sep 2022 18:22:06 -0400
Subject: [PATCH 3/4] Merge cleanup

---
 doc/source/whatsnew/v1.5.1.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index a733eb63be7d4..77622b25a9404 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -75,6 +75,9 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.describe` raising ``TypeError`` when result contains ``NA`` (:issue:`48778`)
 - Fixed regression in :meth:`DataFrame.plot` ignoring invalid ``colormap`` for ``kind="scatter"`` (:issue:`48726`)
 - Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
+- Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
+- Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
+- Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
 - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------

From 9800be72e6e15a36b7773ba5bd4bb74e53463a9a Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Wed, 28 Sep 2022 18:25:13 -0400
Subject: [PATCH 4/4] Range fixup

---
 pandas/tests/groupby/test_groupby_dropna.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 18abb3e30ef45..ee660dd073ce9 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -394,7 +394,7 @@ def test_groupby_drop_nan_with_multi_index():
 
 
 # sequence_index enumerates all strings made up of x, y, z of length 4
-@pytest.mark.parametrize("sequence_index", range(3**4 + 1))
+@pytest.mark.parametrize("sequence_index", range(3**4))
 @pytest.mark.parametrize(
     "dtype",
     [
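
A minimal reproduction sketch of the regression these patches address (illustration only, not part of the patch series; it assumes a pandas build that includes the fix, and the printed result is indicative). The problematic case is a nullable grouper whose first null is preceded by a repeated value: there codes[:na_index].argmax() returns the position of the largest code seen so far (2 below), while codes[:na_index].max() returns its value (1), so the old logic handed the null group an out-of-range code and the groupby failed.

    import pandas as pd

    # With sort=False the grouper factorizes to codes [0, 0, 1, -1]; the first
    # null sits at position 3, after a repeated value, which is exactly where
    # argmax and max diverge:
    #   codes[:3].argmax() + 1 == 3   (an out-of-range code for the null group)
    #   codes[:3].max() + 1    == 2   (the correct code for the null group)
    key = pd.Series([1, 1, 2, pd.NA], dtype="Int64")
    df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})

    result = df.groupby("key", dropna=False, sort=False).sum()
    print(result)
    # With the fix the groups come out as 1 -> 3, 2 -> 3, <NA> -> 4.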
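For reference, a standalone sketch (again illustrative, not part of the patches) of how the reworked test derives its grouper from sequence_index: each index in range(3**4) is read as a four-digit base-3 number whose digits pick "x", "y" or "z" (the null label), least-significant digit first, so the in-test comment's "xxyz" for index 5 is the same digits read in the opposite order. Patch 4 drops the stray "+ 1" so the parametrization covers exactly these 81 combinations. The decode helper below is hypothetical; the test inlines the expression.

    def decode(sequence_index: int) -> str:
        # Read sequence_index as four base-3 digits, least-significant first,
        # and map 0/1/2 to the labels "x"/"y"/"z" ("z" marks the null value).
        return "".join(
            {0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)
        )

    assert decode(0) == "xxxx"         # all non-null, first label only
    assert decode(5) == "zyxx"         # 5 = 2*1 + 1*3 -> digits 2, 1, 0, 0
    assert decode(3**4 - 1) == "zzzz"  # 80 = 2222 in base 3: the all-null grouper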