
Commit 61b57a7

charlesdong1991 authored and rhshadrach committed
ENH: Add dropna in groupby to allow NaN in keys (pandas-dev#30584)
1 parent 2cbac0c commit 61b57a7

10 files changed: +477 -3 lines


doc/source/user_guide/groupby.rst (+27)

@@ -199,6 +199,33 @@ For example, the groups created by ``groupby()`` below are in the order they app
     df3.groupby(['X']).get_group('B')
 
+.. _groupby.dropna:
+
+.. versionadded:: 1.1.0
+
+GroupBy dropna
+^^^^^^^^^^^^^^
+
+By default ``NA`` values are excluded from group keys during the ``groupby`` operation.
+However, if you want to include ``NA`` values in group keys, you can pass ``dropna=False``.
+
+.. ipython:: python
+
+    df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+    df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
+
+    df_dropna
+
+.. ipython:: python
+
+    # Default ``dropna`` is set to True, which excludes NaN from the group keys
+    df_dropna.groupby(by=["b"], dropna=True).sum()
+
+    # To allow NaN in the group keys, set ``dropna`` to False
+    df_dropna.groupby(by=["b"], dropna=False).sum()
+
+The default setting of the ``dropna`` argument is ``True``, which means ``NA`` values are not included in group keys.
+
 
 .. _groupby.attributes:
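For reference, a doctest-style sketch of what the two ipython blocks above should render to; the output values are transcribed from the ``DataFrame.groupby`` docstring added later in this commit, though exact column spacing may differ:

>>> import pandas as pd
>>> df_dropna = pd.DataFrame(
...     [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]], columns=["a", "b", "c"]
... )
>>> df_dropna.groupby(by=["b"], dropna=True).sum()   # NaN key is dropped
     a  c
b
1.0  2  3
2.0  2  5
>>> df_dropna.groupby(by=["b"], dropna=False).sum()  # NaN key becomes its own group
     a  c
b
1.0  2  3
2.0  2  5
NaN  1  4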

doc/source/whatsnew/v1.1.0.rst (+31)

@@ -36,6 +36,37 @@ For example:
     ser["2014"]
     ser.loc["May 2015"]
 
+
+.. _whatsnew_110.groupby_key:
+
+Allow NA in groupby key
+^^^^^^^^^^^^^^^^^^^^^^^
+
+With :ref:`groupby <groupby.dropna>`, we've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and
+:meth:`Series.groupby` to allow ``NA`` values in group keys. Users can set ``dropna`` to ``False``
+if they want to include ``NA`` values in the group keys. The default of ``dropna`` is ``True`` to
+preserve backwards compatibility (:issue:`3729`).
+
+.. ipython:: python
+
+    df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+    df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
+
+    df_dropna
+
+.. ipython:: python
+
+    # Default ``dropna`` is set to True, which excludes NaN from the group keys
+    df_dropna.groupby(by=["b"], dropna=True).sum()
+
+    # To allow NaN in the group keys, set ``dropna`` to False
+    df_dropna.groupby(by=["b"], dropna=False).sum()
+
+The default setting of the ``dropna`` argument is ``True``, which means ``NA`` values are not included in group keys.
+
+.. versionadded:: 1.1.0
+
+
 .. _whatsnew_110.key_sorting:
 
 Sorting with keys

pandas/core/algorithms.py (+13 -1)

@@ -523,7 +523,11 @@ def _factorize_array(
     ),
 )
 def factorize(
-    values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None
+    values,
+    sort: bool = False,
+    na_sentinel: int = -1,
+    size_hint: Optional[int] = None,
+    dropna: bool = True,
 ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
     """
     Encode the object as an enumerated type or categorical variable.
@@ -649,6 +653,14 @@ def factorize(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
+    code_is_na = codes == na_sentinel
+    if not dropna and code_is_na.any():
+        # na_value is set based on the dtype of uniques, and compat is set to
+        # False because we do not want na_value to be 0 for integers
+        na_value = na_value_for_dtype(uniques.dtype, compat=False)
+        uniques = np.append(uniques, [na_value])
+        codes = np.where(code_is_na, len(uniques) - 1, codes)
+
     uniques = _reconstruct_data(uniques, dtype, original)
 
     # return original tenor
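To illustrate the branch added above, here is a sketch of calling ``factorize`` with and without the new ``dropna`` keyword; the codes/uniques shown are hand-traced from the diff rather than captured output, so the exact array reprs may differ slightly:

>>> import numpy as np
>>> import pandas as pd
>>> values = np.array([2.0, np.nan, 1.0, 2.0])
>>> codes, uniques = pd.factorize(values)                # default dropna=True
>>> codes, uniques                                       # NaN keeps the -1 sentinel
(array([ 0, -1,  1,  0]), array([2., 1.]))
>>> codes, uniques = pd.factorize(values, dropna=False)  # NaN is appended to uniques
>>> codes, uniques                                       # and given a real code
(array([0, 2, 1, 0]), array([ 2.,  1., nan]))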

pandas/core/frame.py (+37)

@@ -6134,6 +6134,41 @@ def update(
     Type
     Captive      210.0
     Wild         185.0
+
+    We can also choose to include NA in the group keys or not by setting
+    the `dropna` parameter; the default setting is `True`:
+
+    >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+    >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
+
+    >>> df.groupby(by=["b"]).sum()
+         a  c
+    b
+    1.0  2  3
+    2.0  2  5
+
+    >>> df.groupby(by=["b"], dropna=False).sum()
+         a  c
+    b
+    1.0  2  3
+    2.0  2  5
+    NaN  1  4
+
+    >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
+    >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
+
+    >>> df.groupby(by="a").sum()
+          b      c
+    a
+    a  13.0   13.0
+    b  12.3  123.0
+
+    >>> df.groupby(by="a", dropna=False).sum()
+            b      c
+    a
+    a    13.0   13.0
+    b    12.3  123.0
+    NaN  12.3   33.0
     """
 )
 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
@@ -6147,6 +6182,7 @@ def groupby(
         group_keys: bool = True,
         squeeze: bool = False,
         observed: bool = False,
+        dropna: bool = True,
     ) -> "DataFrameGroupBy":
         from pandas.core.groupby.generic import DataFrameGroupBy
 
@@ -6164,6 +6200,7 @@ def groupby(
             group_keys=group_keys,
             squeeze=squeeze,
             observed=observed,
+            dropna=dropna,
         )
 
     _shared_docs[

pandas/core/generic.py (+6)

@@ -7475,6 +7475,12 @@ def clip(
     If False: show all values for categorical groupers.
 
     .. versionadded:: 0.23.0
+dropna : bool, default True
+    If True, and if group keys contain NA values, the NA values together with
+    the row/column will be dropped.
+    If False, NA values will also be treated as the key in groups.
+
+    .. versionadded:: 1.1.0
 
 Returns
 -------

pandas/core/groupby/groupby.py (+5)

@@ -474,6 +474,7 @@ def __init__(
         squeeze: bool = False,
         observed: bool = False,
         mutated: bool = False,
+        dropna: bool = True,
     ):
 
         self._selection = selection
@@ -496,6 +497,7 @@ def __init__(
         self.squeeze = squeeze
         self.observed = observed
         self.mutated = mutated
+        self.dropna = dropna
 
         if grouper is None:
             from pandas.core.groupby.grouper import get_grouper
@@ -508,6 +510,7 @@ def __init__(
                 sort=sort,
                 observed=observed,
                 mutated=self.mutated,
+                dropna=self.dropna,
             )
 
         self.obj = obj
@@ -2649,6 +2652,7 @@ def get_groupby(
     squeeze: bool = False,
     observed: bool = False,
     mutated: bool = False,
+    dropna: bool = True,
 ) -> GroupBy:
 
     klass: Type[GroupBy]
@@ -2677,4 +2681,5 @@ def get_groupby(
         squeeze=squeeze,
         observed=observed,
         mutated=mutated,
+        dropna=dropna,
     )

pandas/core/groupby/grouper.py (+12 -2)

@@ -134,7 +134,9 @@ def __new__(cls, *args, **kwargs):
             cls = TimeGrouper
         return super().__new__(cls)
 
-    def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
+    def __init__(
+        self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True
+    ):
         self.key = key
         self.level = level
         self.freq = freq
@@ -146,6 +148,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
         self.indexer = None
         self.binner = None
         self._grouper = None
+        self.dropna = dropna
 
     @property
     def ax(self):
@@ -171,6 +174,7 @@ def _get_grouper(self, obj, validate: bool = True):
             level=self.level,
             sort=self.sort,
             validate=validate,
+            dropna=self.dropna,
         )
         return self.binner, self.grouper, self.obj
 
@@ -283,6 +287,7 @@ def __init__(
         sort: bool = True,
         observed: bool = False,
         in_axis: bool = False,
+        dropna: bool = True,
     ):
         self.name = name
         self.level = level
@@ -293,6 +298,7 @@ def __init__(
         self.obj = obj
         self.observed = observed
         self.in_axis = in_axis
+        self.dropna = dropna
 
         # right place for this?
         if isinstance(grouper, (Series, Index)) and name is None:
@@ -446,7 +452,9 @@ def _make_codes(self) -> None:
                 codes = self.grouper.codes_info
                 uniques = self.grouper.result_index
             else:
-                codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
+                codes, uniques = algorithms.factorize(
+                    self.grouper, sort=self.sort, dropna=self.dropna
+                )
                 uniques = Index(uniques, name=self.name)
             self._codes = codes
             self._group_index = uniques
@@ -465,6 +473,7 @@ def get_grouper(
     observed: bool = False,
     mutated: bool = False,
     validate: bool = True,
+    dropna: bool = True,
 ) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]":
     """
     Create and return a BaseGrouper, which is an internal
@@ -655,6 +664,7 @@ def is_in_obj(gpr) -> bool:
                 sort=sort,
                 observed=observed,
                 in_axis=in_axis,
+                dropna=dropna,
             )
             if not isinstance(gpr, Grouping)
             else gpr
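Since ``Grouper.__init__`` and ``Grouper._get_grouper`` now carry the flag as well, a ``pd.Grouper`` object should in principle be able to forward ``dropna`` down to ``factorize``. The sketch below only mirrors what the diff suggests and is not verified against every code path:

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"b": [2.0, np.nan, 1.0, 2.0], "c": [3, 4, 3, 2]})
>>> # Grouper._get_grouper passes dropna=self.dropna on to get_grouper, so the
>>> # NaN key should survive here just as with df.groupby("b", dropna=False)
>>> df.groupby(pd.Grouper(key="b", dropna=False)).sum()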

pandas/core/series.py (+30)

@@ -1603,6 +1603,34 @@ def _set_name(self, name, inplace=False) -> "Series":
     Captive    210.0
     Wild       185.0
     Name: Max Speed, dtype: float64
+
+    We can also choose to include `NA` in the group keys or not by setting
+    the `dropna` parameter; the default setting is `True`:
+
+    >>> ser = pd.Series([1, 2, 3, 3], index=["a", "a", "b", np.nan])
+    >>> ser.groupby(level=0).sum()
+    a    3
+    b    3
+    dtype: int64
+
+    >>> ser.groupby(level=0, dropna=False).sum()
+    a      3
+    b      3
+    NaN    3
+    dtype: int64
+
+    >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
+    >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
+    >>> ser.groupby(["a", "b", "a", np.nan]).mean()
+    a    210.0
+    b    350.0
+    Name: Max Speed, dtype: float64
+
+    >>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
+    a      210.0
+    b      350.0
+    NaN     20.0
+    Name: Max Speed, dtype: float64
     """
 )
 @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs)
@@ -1616,6 +1644,7 @@ def groupby(
         group_keys: bool = True,
         squeeze: bool = False,
         observed: bool = False,
+        dropna: bool = True,
     ) -> "SeriesGroupBy":
         from pandas.core.groupby.generic import SeriesGroupBy
 
@@ -1633,6 +1662,7 @@ def groupby(
             group_keys=group_keys,
             squeeze=squeeze,
             observed=observed,
+            dropna=dropna,
         )
 
     # ----------------------------------------------------------------------
