
Commit 19a3f5a

Backport PR #48539 on branch 1.5.x (REGR: groupby doesn't identify null values when sort=False) (#48568)
Backport PR #48539: REGR: groupby doesn't identify null values when sort=False

Co-authored-by: Richard Shadrach <[email protected]>
1 parent 6ee47f9 commit 19a3f5a

4 files changed (+39, -8 lines)

doc/source/whatsnew/v1.5.0.rst

-1
@@ -1016,7 +1016,6 @@ Numeric
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
-- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)
 
 Conversion
 ^^^^^^^^^^
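The :issue:`46601` entry is removed because this fix restores the long-standing conversion: ``None`` in an object array is once again represented by the dtype's null (``np.nan``) in the factorized uniques, as the updated tests in pandas/tests/test_algos.py below assert. A minimal sketch of the expected behavior, assuming pandas 1.5.x with this backport applied:

import numpy as np
import pandas as pd

# None is normalized to the object dtype's null (np.nan) during factorization.
codes, uniques = pd.factorize(
    np.array(["a", None, "b", "a"], dtype=object), use_na_sentinel=False
)
print(codes)    # expected: [0 1 2 0]
print(uniques)  # expected: ['a' nan 'b']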

pandas/core/algorithms.py

+11
@@ -566,6 +566,17 @@ def factorize_array(
 
     hash_klass, values = _get_hashtable_algo(values)
 
+    # factorize can now handle differentiating various types of null values.
+    # However, for backwards compatibility we only use the null for the
+    # provided dtype. This may be revisited in the future, see GH#48476.
+    null_mask = isna(values)
+    if null_mask.any():
+        na_value = na_value_for_dtype(values.dtype, compat=False)
+        # Don't modify (potentially user-provided) array
+        # error: No overload variant of "where" matches argument types "Any", "object",
+        # "ndarray[Any, Any]"
+        values = np.where(null_mask, na_value, values)  # type: ignore[call-overload]
+
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
         values,
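In isolation, the step added here replaces every missing value with the canonical null for the array's dtype before hashing, so distinct null objects can no longer fall into distinct groups. A standalone sketch of that step (an illustration of the idea, not part of the commit; the imports mirror what factorize_array already has in scope):

import numpy as np
from pandas.core.dtypes.missing import isna, na_value_for_dtype

values = np.array(["x", None, float("nan"), "y"], dtype=object)

null_mask = isna(values)  # True wherever any kind of null sits
if null_mask.any():
    # np.nan is the null for object dtype; datetime-like dtypes get NaT, etc.
    na_value = na_value_for_dtype(values.dtype, compat=False)
    # Build a new array rather than mutating the (possibly user-provided) input.
    values = np.where(null_mask, na_value, values)

print(values)  # expected: ['x' nan nan 'y']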

pandas/tests/groupby/test_groupby_dropna.py

+22 -1
@@ -3,6 +3,8 @@
 
 from pandas.compat.pyarrow import pa_version_under1p01
 
+from pandas.core.dtypes.missing import na_value_for_dtype
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -422,7 +424,7 @@ def test_groupby_drop_nan_with_multi_index():
         (
             [
                 pd.Period("2012-02-01", freq="D"),
-                pd.NA,
+                pd.NaT,
                 pd.Period("2012-01-01", freq="D"),
                 pd.Period("2012-02-01", freq="D"),
             ],
@@ -454,3 +456,22 @@ def test_no_sort_keep_na(values, dtype, test_series):
         # TODO: Slicing reorders categories?
         expected.index = expected.index.reorder_categories(["y", "x"])
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("dtype", [object, None])
+def test_null_is_null_for_dtype(
+    sort, dtype, nulls_fixture, nulls_fixture2, test_series
+):
+    # GH#48506 - groups should always result in using the null for the dtype
+    df = pd.DataFrame({"a": [1, 2]})
+    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
+    obj = df["a"] if test_series else df
+    gb = obj.groupby(groups, dropna=False, sort=sort)
+    result = gb.sum()
+    index = pd.Index([na_value_for_dtype(groups.dtype)])
+    expected = pd.DataFrame({"a": [3]}, index=index)
+    if test_series:
+        tm.assert_series_equal(result, expected["a"])
+    else:
+        tm.assert_frame_equal(result, expected)
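The ``nulls_fixture`` / ``nulls_fixture2`` fixtures parametrize over pandas' null singletons (e.g. ``None``, ``np.nan``, ``pd.NaT``, ``pd.NA``), so the new test checks that every pairing collapses into a single null group. A concrete sketch of the behavior it asserts, assuming pandas 1.5.x with this backport:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
# Two different null representations used as group labels.
keys = pd.Series([None, np.nan], dtype=object)

# With dropna=False both labels are identified as the same null group,
# regardless of sort, so the rows are summed into a single NaN-indexed row.
for sort in (True, False):
    result = df.groupby(keys, dropna=False, sort=sort)["a"].sum()
    print(result)  # expected: one row, index NaN, value 3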

pandas/tests/test_algos.py

+6 -6
@@ -468,7 +468,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
             (
                 ["a", None, "b", "a"],
                 np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
-                np.array(["a", None, "b"], dtype=object),
+                np.array(["a", np.nan, "b"], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
@@ -482,16 +482,16 @@ def test_object_factorize_use_na_sentinel_false(
     ):
         codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
     @pytest.mark.parametrize(
         "data, expected_codes, expected_uniques",
         [
             (
                 [1, None, 1, 2],
                 np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
-                np.array([1, None, 2], dtype="O"),
+                np.array([1, np.nan, 2], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
@@ -505,8 +505,8 @@ def test_int_factorize_use_na_sentinel_false(
     ):
         codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
 
 class TestUnique:
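For context on the ``strict_nan=True`` additions: ``tm.assert_numpy_array_equal`` compares object arrays with ``strict_nan=False`` by default, under which ``None`` and ``np.nan`` count as the same missing value, so the looser assertions could not pin down which null ``factorize`` actually returns. A small illustration (a sketch, not part of the commit):

import numpy as np
import pandas._testing as tm

left = np.array(["a", None, "b"], dtype=object)
right = np.array(["a", np.nan, "b"], dtype=object)

# Default comparison treats None and np.nan as equivalent nulls, so this passes.
tm.assert_numpy_array_equal(left, right)

# With strict_nan=True the distinct null representations are no longer
# considered equal, so this raises an AssertionError.
try:
    tm.assert_numpy_array_equal(left, right, strict_nan=True)
except AssertionError:
    print("arrays differ when strict_nan=True")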
