Skip to content

Commit 9498c10

Browse files
Backport PR #48824 on branch 1.5.x (REGR: groupby fails with nullable dtypes and dropna=False) (#48938)
Backport PR #48824: REGR: groupby fails with nullable dtypes and dropna=False Co-authored-by: Richard Shadrach <[email protected]>
1 parent 48d20aa commit 9498c10

File tree

4 files changed

+73
-56
lines changed

4 files changed

+73
-56
lines changed

doc/source/whatsnew/v1.5.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ Fixed regressions
8383
- Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
8484
- Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
8585
- Fixed regression in :meth:`DataFrame.apply` when passing non-zero ``axis`` via keyword argument (:issue:`48656`)
86-
-
86+
- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
8787

8888
.. ---------------------------------------------------------------------------
8989

pandas/core/arrays/arrow/array.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -618,8 +618,8 @@ def factorize(
618618
na_mask = indices.values == -1
619619
na_index = na_mask.argmax()
620620
if na_mask[na_index]:
621-
uniques = uniques.insert(na_index, self.dtype.na_value)
622-
na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
621+
na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
622+
uniques = uniques.insert(na_code, self.dtype.na_value)
623623
indices[indices >= na_code] += 1
624624
indices[indices == -1] = na_code
625625
else:

pandas/core/arrays/masked.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -911,7 +911,7 @@ def factorize(
911911
else:
912912
# mypy error: Slice index must be an integer or None
913913
# https://github.com/python/mypy/issues/2410
914-
na_code = codes[:na_index].argmax() + 1 # type: ignore[misc]
914+
na_code = codes[:na_index].max() + 1 # type: ignore[misc]
915915
codes[codes >= na_code] += 1
916916
codes[codes == -1] = na_code
917917
# dummy value for uniques; not used since uniques_mask will be True

pandas/tests/groupby/test_groupby_dropna.py

+69-52
Original file line numberDiff line numberDiff line change
@@ -393,74 +393,91 @@ def test_groupby_drop_nan_with_multi_index():
393393
tm.assert_frame_equal(result, expected)
394394

395395

396+
# sequence_index enumerates all strings made up of x, y, z of length 4
397+
@pytest.mark.parametrize("sequence_index", range(3**4))
396398
@pytest.mark.parametrize(
397-
"values, dtype",
399+
"dtype",
398400
[
399-
([2, np.nan, 1, 2], None),
400-
([2, np.nan, 1, 2], "UInt8"),
401-
([2, np.nan, 1, 2], "Int8"),
402-
([2, np.nan, 1, 2], "UInt16"),
403-
([2, np.nan, 1, 2], "Int16"),
404-
([2, np.nan, 1, 2], "UInt32"),
405-
([2, np.nan, 1, 2], "Int32"),
406-
([2, np.nan, 1, 2], "UInt64"),
407-
([2, np.nan, 1, 2], "Int64"),
408-
([2, np.nan, 1, 2], "Float32"),
409-
([2, np.nan, 1, 2], "Int64"),
410-
([2, np.nan, 1, 2], "Float64"),
401+
None,
402+
"UInt8",
403+
"Int8",
404+
"UInt16",
405+
"Int16",
406+
"UInt32",
407+
"Int32",
408+
"UInt64",
409+
"Int64",
410+
"Float32",
411+
"Int64",
412+
"Float64",
413+
"category",
414+
"string",
411415
pytest.param(
412-
["y", None, "x", "y"],
413-
"category",
414-
marks=pytest.mark.xfail(
415-
reason="dropna=False not correct for categorical, GH#48645"
416-
),
417-
),
418-
(["y", pd.NA, "x", "y"], "string"),
419-
pytest.param(
420-
["y", pd.NA, "x", "y"],
421416
"string[pyarrow]",
422417
marks=pytest.mark.skipif(
423418
pa_version_under1p01, reason="pyarrow is not installed"
424419
),
425420
),
426-
(
427-
["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
428-
"datetime64[ns]",
429-
),
430-
(
431-
[
432-
pd.Period("2012-02-01", freq="D"),
433-
pd.NaT,
434-
pd.Period("2012-01-01", freq="D"),
435-
pd.Period("2012-02-01", freq="D"),
436-
],
437-
None,
438-
),
439-
(pd.arrays.SparseArray([2, np.nan, 1, 2]), None),
421+
"datetime64[ns]",
422+
"period[d]",
423+
"Sparse[float]",
440424
],
441425
)
442426
@pytest.mark.parametrize("test_series", [True, False])
443-
def test_no_sort_keep_na(values, dtype, test_series):
444-
# GH#46584
445-
key = pd.Series(values, dtype=dtype)
446-
df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})
427+
def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
428+
# GH#46584, GH#48794
429+
430+
# Convert sequence_index into a string sequence, e.g. 5 becomes "xxyz"
431+
# This sequence is used for the grouper.
432+
sequence = "".join(
433+
[{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
434+
)
435+
436+
if dtype == "category" and "z" in sequence:
437+
# Only xfail when nulls are present
438+
msg = "dropna=False not correct for categorical, GH#48645"
439+
request.node.add_marker(pytest.mark.xfail(reason=msg))
440+
441+
# Unique values to use for grouper, depends on dtype
442+
if dtype in ("string", "string[pyarrow]"):
443+
uniques = {"x": "x", "y": "y", "z": pd.NA}
444+
elif dtype in ("datetime64[ns]", "period[d]"):
445+
uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
446+
else:
447+
uniques = {"x": 1, "y": 2, "z": np.nan}
448+
449+
df = pd.DataFrame(
450+
{
451+
"key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
452+
"a": [0, 1, 2, 3],
453+
}
454+
)
447455
gb = df.groupby("key", dropna=False, sort=False)
448456
if test_series:
449457
gb = gb["a"]
458+
result = gb.sum()
450459

451-
warn = None
452-
if isinstance(values, pd.arrays.SparseArray):
453-
warn = FutureWarning
454-
msg = "passing a SparseArray to pd.Index will store that array directly"
455-
with tm.assert_produces_warning(warn, match=msg):
456-
result = gb.sum()
457-
expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
460+
# Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
461+
# issues with hashing np.nan
462+
summed = {}
463+
for idx, label in enumerate(sequence):
464+
summed[label] = summed.get(label, 0) + idx
465+
if dtype == "category":
466+
index = pd.CategoricalIndex(
467+
[uniques[e] for e in summed],
468+
list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
469+
name="key",
470+
)
471+
elif isinstance(dtype, str) and dtype.startswith("Sparse"):
472+
index = pd.Index(
473+
pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
474+
)
475+
else:
476+
index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
477+
expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
478+
if not test_series:
479+
expected = expected.to_frame()
458480

459-
if test_series:
460-
expected = expected["a"]
461-
if expected.index.is_categorical():
462-
# TODO: Slicing reorders categories?
463-
expected.index = expected.index.reorder_categories(["y", "x"])
464481
tm.assert_equal(result, expected)
465482

466483

0 commit comments

Comments (0)