diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 406ca9ba045c9..69a8be39be62c 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -9,6 +9,7 @@ New features
 ~~~~~~~~~~~~
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
+- ``groupby`` now accepts ``group_nas`` as a keyword argument, enabling ``np.nan`` to be used as a grouping label (:issue:`3729`)
 
 .. _whatsnew_0240.enhancements.other:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9e34b8eb55ccb..adfe47ddd5b94 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -493,6 +493,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     %(values)s%(sort)s%(order)s
     na_sentinel : int, default -1
         Value to mark "not found".
+    factor_nas : boolean, default False
+        If True, treat NaN as a factorizable value instead of a missing value
     %(size_hint)s\
 
     Returns
@@ -597,7 +599,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
 )
 @Appender(_shared_docs['factorize'])
 @deprecate_kwarg(old_arg_name='order', new_arg_name=None)
-def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
+def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None,
+              factor_nas=False):
     # Implementation notes: This method is responsible for 3 things
     # 1.) coercing data to array-like (ndarray, Index, extension array)
     # 2.) factorizing labels and uniques
@@ -642,6 +645,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
                                             na_sentinel=na_sentinel,
                                             assume_unique=True)
 
+    if factor_nas and (labels == na_sentinel).any():
+        new_na_sentinel = len(uniques)
+        # Append np.nan to the uniques and remap the sentinel labels to it
+        uniques = np.resize(uniques, len(uniques) + 1)
+        uniques[-1] = np.nan
+        labels = np.where(labels == na_sentinel, new_na_sentinel, labels)
+
     uniques = _reconstruct_data(uniques, dtype, original)
 
     # return original tenor
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 8fa79a130d1f8..f523ab73730a4 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6584,7 +6584,8 @@ def clip_lower(self, threshold, axis=None, inplace=False):
                                          axis=axis, inplace=inplace)
 
     def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
-                group_keys=True, squeeze=False, observed=False, **kwargs):
+                group_keys=True, squeeze=False, observed=False,
+                group_nas=False, **kwargs):
         """
         Group series using mapper (dict or key function, apply given function
         to group, return result as series) or by a series of columns.
@@ -6621,6 +6622,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
             This only applies if any of the groupers are Categoricals
             If True: only show observed values for categorical groupers.
            If False: show all values for categorical groupers.
+        group_nas : boolean, default False
+            If True, group rows whose label is NaN instead of dropping them.
 
             .. versionadded:: 0.23.0
 
@@ -6656,7 +6659,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
         axis = self._get_axis_number(axis)
         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
                        sort=sort, group_keys=group_keys, squeeze=squeeze,
-                       observed=observed, **kwargs)
+                       observed=observed, group_nas=group_nas, **kwargs)
 
     def asfreq(self, freq, method=None, how=None, normalize=False,
                fill_value=None):
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index c69d7f43de8ea..763cb9e2c875c 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -567,7 +567,7 @@ class _GroupBy(PandasObject, SelectionMixin):
     def __init__(self, obj, keys=None, axis=0, level=None,
                  grouper=None, exclusions=None, selection=None, as_index=True,
                  sort=True, group_keys=True, squeeze=False,
-                 observed=False, **kwargs):
+                 observed=False, group_nas=False, **kwargs):
 
         self._selection = selection
 
@@ -588,6 +588,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
         self.group_keys = group_keys
         self.squeeze = squeeze
         self.observed = observed
+        self.group_nas = group_nas
         self.mutated = kwargs.pop('mutated', False)
 
         if grouper is None:
@@ -596,6 +597,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
                                                          level=level,
                                                          sort=sort,
                                                          observed=observed,
+                                                         group_nas=group_nas,
                                                          mutated=self.mutated)
 
         self.obj = obj
@@ -2922,6 +2924,8 @@ class Grouping(object):
     level :
     observed : boolean, default False
         If we are a Categorical, use the observed values
+    group_nas : boolean, default False
+        If True, treat NaN as a regular grouping value instead of dropping it
     in_axis : if the Grouping is a column in self.obj and hence among
         Groupby.exclusions list
 
@@ -2937,7 +2941,7 @@ class Grouping(object):
     """
 
     def __init__(self, index, grouper=None, obj=None, name=None, level=None,
-                 sort=True, observed=False, in_axis=False):
+                 sort=True, observed=False, group_nas=False, in_axis=False):
 
         self.name = name
         self.level = level
@@ -2947,6 +2951,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
         self.sort = sort
         self.obj = obj
         self.observed = observed
+        self.group_nas = group_nas
         self.in_axis = in_axis
 
         # right place for this?
@@ -3100,7 +3105,7 @@ def _make_labels(self):
                 uniques = self.grouper.result_index
             else:
                 labels, uniques = algorithms.factorize(
-                    self.grouper, sort=self.sort)
+                    self.grouper, sort=self.sort, factor_nas=self.group_nas)
                 uniques = Index(uniques, name=self.name)
             self._labels = labels
             self._group_index = uniques
@@ -3112,7 +3117,8 @@ def groups(self):
 
 
 def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
-                 observed=False, mutated=False, validate=True):
+                 observed=False, group_nas=False, mutated=False,
+                 validate=True):
     """
     create and return a BaseGrouper, which is an internal
     mapping of how to create the grouper indexers.
@@ -3311,6 +3317,7 @@ def is_in_obj(gpr):
                         level=level,
                         sort=sort,
                         observed=observed,
+                        group_nas=group_nas,
                         in_axis=in_axis) \
             if not isinstance(gpr, Grouping) else gpr
 
@@ -3752,7 +3759,9 @@ def nunique(self, dropna=True):
         except TypeError:  # catches object dtypes
             assert val.dtype == object, \
                 'val.dtype must be object, got %s' % val.dtype
-            val, _ = algorithms.factorize(val, sort=False)
+            val, _ = algorithms.factorize(
+                val, sort=False, factor_nas=self.group_nas
+            )
             sorter = np.lexsort((val, ids))
             _isna = lambda a: a == -1
         else:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e05f9de5ea7f4..63923160496fb 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -276,6 +276,12 @@ def test_with_na_groups(dtype):
 
     assert_series_equal(agged, expected, check_dtype=False)
 
+    grouped_na = values.groupby(labels, group_nas=True)
+    agged_na = grouped_na.agg(len)
+    expected_na = Series([4, 2, 4], index=['bar', 'foo', np.nan])
+
+    assert_series_equal(agged_na, expected_na, check_dtype=False)
+
     # assert issubclass(agged.dtype.type, np.integer)
 
     # explicitly return a float from my function
@@ -288,6 +294,28 @@ def f(x):
 
     assert_series_equal(agged, expected, check_dtype=False)
     assert issubclass(agged.dtype.type, np.dtype(dtype).type)
 
+    agged_na = grouped_na.agg(f)
+    expected_na = Series([4, 2, 4], index=['bar', 'foo', np.nan])
+
+    assert_series_equal(agged_na, expected_na, check_dtype=False)
+    assert issubclass(agged_na.dtype.type, np.dtype(dtype).type)
+
+    # Check that the DataFrame groupby interface also handles NaNs correctly
+    df = pd.DataFrame({"Vals": values, "Labs": labels})
+
+    agged = df.groupby("Labs")["Vals"].sum()
+    expected = Series(
+        [4, 2], index=Index(['bar', 'foo'], name='Labs'), name='Vals'
+    )
+    assert_series_equal(agged, expected, check_dtype=False)
+
+    agged_na = df.groupby("Labs", group_nas=True)["Vals"].sum()
+    expected_na = Series(
+        [4, 2, 4], index=Index(['bar', 'foo', np.nan], name='Labs'),
+        name='Vals'
+    )
+    assert_series_equal(agged_na, expected_na, check_dtype=False)
+
 
 def test_indices_concatenation_order():
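
Usage sketch (not part of the patch): the snippet below illustrates the behaviour the new ``group_nas`` keyword is intended to enable once this diff is applied. The data are illustrative only, chosen to match the counts asserted in ``test_with_na_groups`` above (four 'bar' labels, two 'foo' labels and four NaN labels over a series of ones); the actual test fixtures are defined elsewhere and are not shown in this diff.

    import numpy as np
    import pandas as pd

    # Illustrative data (assumed): 4 x 'bar', 2 x 'foo', 4 x NaN labels
    # over a series of ones, mirroring the expectations in the test.
    values = pd.Series(np.ones(10))
    labels = pd.Series(['foo', 'foo', 'bar', 'bar', 'bar',
                        np.nan, np.nan, np.nan, 'bar', np.nan])

    # Default behaviour: rows whose label is NaN are silently dropped,
    # so only the 'bar' and 'foo' groups appear in the result.
    values.groupby(labels).agg(len)                    # -> bar: 4, foo: 2

    # With group_nas=True, NaN becomes a group of its own; per the
    # factorize change in algorithms.py it is appended after the sorted
    # labels, giving the index ['bar', 'foo', NaN].
    values.groupby(labels, group_nas=True).agg(len)    # -> bar: 4, foo: 2, NaN: 4

    # The keyword is threaded through the DataFrame interface as well.
    df = pd.DataFrame({"Vals": values, "Labs": labels})
    df.groupby("Labs", group_nas=True)["Vals"].sum()   # -> bar: 4, foo: 2, NaN: 4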