Skip to content

Commit f6f3dd3

Browse files
authored
BUG: Groupby dropped nan groups from result when grouping over single column (#36842)
1 parent 54dda90 commit f6f3dd3

File tree

7 files changed

+72
-20
lines changed

7 files changed

+72
-20
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,7 @@ Groupby/resample/rolling
527527
- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
528528
- Bug in :meth:`df.groupby(..).quantile() <pandas.core.groupby.DataFrameGroupBy.quantile>` and :meth:`df.resample(..).quantile() <pandas.core.resample.Resampler.quantile>` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
529529
- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
530+
- Bug in :meth:`DataFrame.groupby` where ``nan`` groups were dropped from the result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
530531

531532
Reshaping
532533
^^^^^^^^^

pandas/_libs/lib.pyx

+18-11
Original file line numberDiff line numberDiff line change
@@ -896,21 +896,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
896896

897897
if lab != cur:
898898
if lab != -1:
899-
tup = PyTuple_New(k)
900-
for j in range(k):
901-
val = keys[j][sorted_labels[j][i - 1]]
902-
PyTuple_SET_ITEM(tup, j, val)
903-
Py_INCREF(val)
904-
899+
if k == 1:
900+
# When k = 1 we do not want to return a tuple as key
901+
tup = keys[0][sorted_labels[0][i - 1]]
902+
else:
903+
tup = PyTuple_New(k)
904+
for j in range(k):
905+
val = keys[j][sorted_labels[j][i - 1]]
906+
PyTuple_SET_ITEM(tup, j, val)
907+
Py_INCREF(val)
905908
result[tup] = index[start:i]
906909
start = i
907910
cur = lab
908911

909-
tup = PyTuple_New(k)
910-
for j in range(k):
911-
val = keys[j][sorted_labels[j][n - 1]]
912-
PyTuple_SET_ITEM(tup, j, val)
913-
Py_INCREF(val)
912+
if k == 1:
913+
# When k = 1 we do not want to return a tuple as key
914+
tup = keys[0][sorted_labels[0][n - 1]]
915+
else:
916+
tup = PyTuple_New(k)
917+
for j in range(k):
918+
val = keys[j][sorted_labels[j][n - 1]]
919+
PyTuple_SET_ITEM(tup, j, val)
920+
Py_INCREF(val)
914921
result[tup] = index[start:]
915922

916923
return result

pandas/core/groupby/ops.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
229229
@cache_readonly
230230
def indices(self):
231231
""" dict {group name -> group indices} """
232-
if len(self.groupings) == 1:
233-
return self.groupings[0].indices
234-
else:
235-
codes_list = [ping.codes for ping in self.groupings]
236-
keys = [ping.group_index for ping in self.groupings]
237-
return get_indexer_dict(codes_list, keys)
232+
codes_list = [ping.codes for ping in self.groupings]
233+
keys = [ping.group_index for ping in self.groupings]
234+
return get_indexer_dict(codes_list, keys)
238235

239236
@property
240237
def codes(self) -> List[np.ndarray]:

pandas/core/sorting.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
TYPE_CHECKING,
55
Callable,
66
DefaultDict,
7+
Dict,
78
Iterable,
89
List,
910
Optional,
@@ -528,16 +529,22 @@ def get_flattened_list(
528529
return [tuple(array) for array in arrays.values()]
529530

530531

531-
def get_indexer_dict(label_list, keys):
532+
def get_indexer_dict(
533+
label_list: List[np.ndarray], keys: List["Index"]
534+
) -> Dict[Union[str, Tuple], np.ndarray]:
532535
"""
533536
Returns
534537
-------
535-
dict
538+
dict:
536539
Labels mapped to indexers.
537540
"""
538541
shape = [len(x) for x in keys]
539542

540543
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
544+
if np.all(group_index == -1):
545+
# When all keys are nan and dropna=True, indices_fast can't handle this
546+
# and the return is empty anyway
547+
return {}
541548
ngroups = (
542549
((group_index.size and group_index.max()) + 1)
543550
if is_int64_overflow_possible(shape)

pandas/tests/groupby/test_groupby.py

+7
Original file line numberDiff line numberDiff line change
@@ -1298,6 +1298,13 @@ def test_groupby_nat_exclude():
12981298
grouped.get_group(pd.NaT)
12991299

13001300

1301+
def test_groupby_two_group_keys_all_nan():
1302+
# GH #36842: Grouping over two group keys shouldn't raise an error
1303+
df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
1304+
result = df.groupby(["a", "b"]).indices
1305+
assert result == {}
1306+
1307+
13011308
def test_groupby_2d_malformed():
13021309
d = DataFrame(index=range(2))
13031310
d["group"] = ["g1", "g2"]

pandas/tests/groupby/test_groupby_dropna.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pytest
33

44
import pandas as pd
5-
import pandas.testing as tm
5+
import pandas._testing as tm
66

77

88
@pytest.mark.parametrize(
@@ -335,3 +335,21 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,
335335

336336
expected = pd.DataFrame(selected_data, index=mi)
337337
tm.assert_frame_equal(result, expected)
338+
339+
340+
def test_groupby_nan_included():
341+
# GH 35646
342+
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
343+
df = pd.DataFrame(data)
344+
grouped = df.groupby("group", dropna=False)
345+
result = grouped.indices
346+
dtype = "int64"
347+
expected = {
348+
"g1": np.array([0, 2], dtype=dtype),
349+
"g2": np.array([3], dtype=dtype),
350+
np.nan: np.array([1, 4], dtype=dtype),
351+
}
352+
for result_values, expected_values in zip(result.values(), expected.values()):
353+
tm.assert_numpy_array_equal(result_values, expected_values)
354+
assert np.isnan(list(result.keys())[2])
355+
assert list(result.keys())[0:2] == ["g1", "g2"]

pandas/tests/window/test_rolling.py

+15
Original file line numberDiff line numberDiff line change
@@ -1087,3 +1087,18 @@ def test_rolling_corr_timedelta_index(index, window):
10871087
result = x.rolling(window).corr(y)
10881088
expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
10891089
tm.assert_almost_equal(result, expected)
1090+
1091+
1092+
def test_groupby_rolling_nan_included():
1093+
# GH 35542
1094+
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
1095+
df = DataFrame(data)
1096+
result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
1097+
expected = DataFrame(
1098+
{"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
1099+
index=pd.MultiIndex.from_tuples(
1100+
[("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
1101+
names=["group", None],
1102+
),
1103+
)
1104+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)