BUG: GroupBy equates different types of null values #48477

Closed · wants to merge 1 commit
54 changes: 11 additions & 43 deletions pandas/core/algorithms.py
@@ -78,10 +78,7 @@
     ABCSeries,
     ABCTimedeltaArray,
 )
-from pandas.core.dtypes.missing import (
-    isna,
-    na_value_for_dtype,
-)
+from pandas.core.dtypes.missing import isna
 
 from pandas.core.array_algos.take import take_nd
 from pandas.core.construction import (
@@ -761,10 +758,6 @@ def factorize(
     if not isinstance(values, ABCMultiIndex):
         values = extract_array(values, extract_numpy=True)
 
-    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
-    # of values, assign na_sentinel=-1 to replace code value for NaN.
-    dropna = na_sentinel is not None
-
     if (
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
         and values.freq is not None
@@ -792,43 +785,17 @@ def factorize(
 
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
-        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
-        # na_sentinel is None we append NA on the end because safe_sort does not
-        # handle null values in uniques.
-        if na_sentinel is None and sort:
-            na_sentinel_arg = -1
-        elif na_sentinel is None:
-            na_sentinel_arg = None
-        else:
-            na_sentinel_arg = na_sentinel
         codes, uniques = factorize_array(
             values,
-            na_sentinel=na_sentinel_arg,
+            na_sentinel=na_sentinel,
             size_hint=size_hint,
         )
 
     if sort and len(uniques) > 0:
-        if na_sentinel is None:
-            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
-            na_sentinel = -1
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
-    if not dropna and sort:
-        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
-        if na_sentinel is None:
-            na_sentinel_arg = -1
-        else:
-            na_sentinel_arg = na_sentinel
-        code_is_na = codes == na_sentinel_arg
-        if code_is_na.any():
-            # na_value is set based on the dtype of uniques, and compat set to False is
-            # because we do not want na_value to be 0 for integers
-            na_value = na_value_for_dtype(uniques.dtype, compat=False)
-            uniques = np.append(uniques, [na_value])
-            codes = np.where(code_is_na, len(uniques) - 1, codes)
-
     uniques = _reconstruct_data(uniques, original.dtype, original)
 
     return _re_wrap_factorize(original, uniques, codes)
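
Taken together, this hunk makes factorize stop special-casing nulls itself: na_sentinel is forwarded to factorize_array unchanged, and when it is None the nulls stay in uniques for safe_sort (patched below) to order. A minimal sketch with illustrative values, assuming the pandas 1.5-era pd.factorize API where na_sentinel=None (introduced in GH35667) means nulls are kept:

# Hedged sketch, not from the diff; values and expected outputs are illustrative.
import numpy as np
import pandas as pd

values = np.array(["b", np.nan, "a"], dtype=object)
codes, uniques = pd.factorize(values, sort=True, na_sentinel=None)

# Old path: safe_sort ordered ["a", "b"], then factorize appended a
# dtype-appropriate NA itself via na_value_for_dtype.
# New path: nan flows through factorize_array and safe_sort directly,
# and _sort_mixed (patched below) still places it last.
print(codes)    # expected: [1 2 0]
print(uniques)  # expected: ["a" "b" nan]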
@@ -1898,24 +1865,25 @@ def safe_sort(
         # may deal with them here without performance loss using `mode='wrap'`
         new_codes = reverse_indexer.take(codes, mode="wrap")
 
-    mask = codes == na_sentinel
-    if verify:
-        mask = mask | (codes < -len(values)) | (codes >= len(values))
+    if na_sentinel is not None:
+        mask = codes == na_sentinel
+        if verify:
+            mask = mask | (codes < -len(values)) | (codes >= len(values))
 
-    if mask is not None:
+    if na_sentinel is not None and mask is not None:
         np.putmask(new_codes, mask, na_sentinel)
 
     return ordered, ensure_platform_int(new_codes)
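
The new guard gives na_sentinel=None a real meaning inside safe_sort: no mask is built and np.putmask is skipped, so codes that point at null entries survive the remapping. A hedged sketch of the sentinel path, calling the internal helper with the keyword names visible in the hunk (not public API, so the exact signature is an assumption):

# Hedged sketch against pandas internals (pandas.core.algorithms.safe_sort).
import numpy as np
from pandas.core.algorithms import safe_sort

values = np.array([3, 1, 2])
codes = np.array([0, 1, 2, -1])

# With na_sentinel=-1, sentinel slots in `codes` are re-masked to -1 after
# the remapping; with na_sentinel=None the patched code skips both branches.
ordered, new_codes = safe_sort(values, codes, na_sentinel=-1)
print(ordered)    # expected: [1 2 3]
print(new_codes)  # expected: [2 0 1 -1]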


 def _sort_mixed(values) -> np.ndarray:
-    """order ints before strings in 1d arrays, safe in py3"""
+    """order ints before strings before nulls in 1d arrays"""
     str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-    none_pos = np.array([x is None for x in values], dtype=bool)
-    nums = np.sort(values[~str_pos & ~none_pos])
+    null_pos = np.array([isna(x) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos & ~null_pos])
     strs = np.sort(values[str_pos])
     return np.concatenate(
-        [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])]
+        [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
     )


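The _sort_mixed change is the piece that makes the null bucket complete: the old `x is None` check caught only None, so np.nan or pd.NaT fell into the numeric bucket and could break np.sort. Switching to isna routes every null representation to the tail. A standalone sketch mirroring the new logic against public APIs (sort_mixed is a hypothetical name, not the pandas helper):

import numpy as np
from pandas import isna


def sort_mixed(values: np.ndarray) -> np.ndarray:
    """Order ints before strings before nulls in a 1d object array."""
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([isna(x) for x in values], dtype=bool)
    nums = np.sort(values[~str_pos & ~null_pos])  # numeric bucket
    strs = np.sort(values[str_pos])               # string bucket
    # nulls keep their original relative order at the end
    return np.concatenate([nums, np.asarray(strs, dtype=object), values[null_pos]])


vals = np.array([3, "b", None, 1, float("nan"), "a"], dtype=object)
print(sort_mixed(vals))  # expected: [1 3 'a' 'b' None nan]
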
82 changes: 68 additions & 14 deletions pandas/tests/groupby/test_groupby_dropna.py
@@ -1,3 +1,5 @@
+import copy
+
 import numpy as np
 import pytest
 
@@ -60,18 +62,23 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
         ),
         (
             False,
-            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
+            # "null1" and "null2" are placeholders for fixtures
+            [["A", "B"], ["A", "null1"], ["A", "null2"], ["B", "A"], ["null1", "B"]],
             {
-                "c": [12.0, 13.3, 123.23, 1.0],
-                "d": [12.0, 234.0, 123.0, 1.0],
-                "e": [12.0, 13.0, 1.0, 1.0],
+                "c": [12.0, 12.3, 1.0, 123.23, 1.0],
+                "d": [12.0, 233.0, 1.0, 123.0, 1.0],
+                "e": [12.0, 12.0, 1.0, 1.0, 1.0],
             },
         ),
     ],
 )
-def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
+def test_groupby_dropna_multi_index_dataframe_different_nan_in_two_groups(
     dropna, tuples, outputs, nulls_fixture, nulls_fixture2
 ):
+    if not dropna and type(nulls_fixture).__name__ == type(nulls_fixture2).__name__:
+        pytest.skip(
+            "tested in test_groupby_dropna_multi_index_dataframe_same_nan_in_two_groups"
+        )
     # GH 3729 this is to test that NA in different groups with different representations
     df_list = [
         ["A", "B", 12, 12, 12],
@@ -83,6 +90,60 @@
     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
     grouped = df.groupby(["a", "b"], dropna=dropna).sum()
 
+    if not dropna:
+        tuples = copy.deepcopy(tuples)
+        tuples[1][1] = nulls_fixture
+        tuples[2][1] = nulls_fixture2
+        tuples[4][0] = nulls_fixture
+    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
+
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna:
+        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
+    expected = pd.DataFrame(outputs, index=mi)
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, tuples, outputs",
+    [
+        (
+            True,
+            [["A", "B"], ["B", "A"]],
+            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
+        ),
+        (
+            False,
+            # "null" is a placeholder for fixtures
+            [["A", "B"], ["A", "null"], ["B", "A"], ["null", "B"]],
+            {
+                "c": [12.0, 13.3, 123.23, 1.0],
+                "d": [12.0, 234.0, 123.0, 1.0],
+                "e": [12.0, 13.0, 1.0, 1.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_multi_index_dataframe_same_nan_in_two_groups(
+    dropna, tuples, outputs, nulls_fixture
+):
+    # GH 3729 this is to test that NA in different groups with different representations
+    df_list = [
+        ["A", "B", 12, 12, 12],
+        ["A", nulls_fixture, 12.3, 233.0, 12],
+        ["B", "A", 123.23, 123, 1],
+        [nulls_fixture, "B", 1, 1, 1.0],
+        ["A", nulls_fixture, 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
+    grouped = df.groupby(["a", "b"], dropna=dropna).sum()
+
+    if not dropna:
+        tuples = copy.deepcopy(tuples)
+        tuples[1][1] = nulls_fixture
+        tuples[3][0] = nulls_fixture
     mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
 
     # Since right now, by default MI will drop NA from levels when we create MI
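
Splitting the old test in two pins down the semantics this PR proposes for dropna=False: nulls of the same type still collapse into a single group, while differently-typed nulls (whatever pair nulls_fixture/nulls_fixture2 supplies, e.g. np.nan vs pd.NaT) become separate group keys. A hedged illustration with made-up values, showing the proposed behavior rather than what released pandas does:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["A", np.nan, pd.NaT, np.nan], "val": [1, 2, 4, 8]})

# Proposed behavior: "A" is one group, the two np.nan rows share a group
# (val == 10), and pd.NaT forms its own group (val == 4).
print(df.groupby("key", dropna=False).sum())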
@@ -439,15 +500,8 @@ def test_no_sort_keep_na(values, dtype, test_series):
     gb = df.groupby("key", dropna=False, sort=False)
     if test_series:
         gb = gb["a"]
-
-    warn = None
-    if isinstance(values, pd.arrays.SparseArray):
-        warn = FutureWarning
-    msg = "passing a SparseArray to pd.Index will store that array directly"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = gb.sum()
-    expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
-
+    result = gb.sum()
+    expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
     if test_series:
         expected = expected["a"]
     if expected.index.is_categorical():
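
The cleanup above drops the assert_produces_warning scaffolding; per the deleted msg, it guarded a FutureWarning about pd.Index wrapping a SparseArray, which is no longer expected here, and the assertion itself is unchanged. For context, a hedged sketch of the ordering property test_no_sort_keep_na asserts, with illustrative values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": [2.0, np.nan, 1.0, np.nan], "a": [1, 2, 3, 4]})

# With sort=False and dropna=False, group keys keep first-seen order, so the
# NaN group sits between 2.0 and 1.0 instead of sorting to the end.
print(df.groupby("key", dropna=False, sort=False).sum())
# expected index order: [2.0, NaN, 1.0]; column a == [1, 6, 3]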