Skip to content

ENH: pivot/groupby index with nan (GH3729) #21669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ New features
~~~~~~~~~~~~

- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
- ``groupby`` now accepts ``group_nas`` as a keyword argument, enabling ``np.nan`` to be used as a grouping label (:issue:`3729`)

.. _whatsnew_0240.enhancements.other:

Expand Down
12 changes: 11 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
%(values)s%(sort)s%(order)s
na_sentinel : int, default -1
Value to mark "not found".
factor_nas : boolean, default False
    If True, treat NaN as a factorizable value rather than a missing one
%(size_hint)s\

Returns
Expand Down Expand Up @@ -597,7 +599,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
)
@Appender(_shared_docs['factorize'])
@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None,
factor_nas=False):
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
# 2.) factorizing labels and uniques
Expand Down Expand Up @@ -642,6 +645,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
na_sentinel=na_sentinel,
assume_unique=True)

if factor_nas and (labels == na_sentinel).any():
new_na_sentinel = len(uniques)
# Append the np.nan
uniques = np.resize(uniques, len(uniques) + 1)
uniques[-1] = np.nan
labels = np.where(labels == na_sentinel, new_na_sentinel, labels)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that you're using broadcasting and NumPy, I don't think we should have any performance issues, but I wonder if we should still add a benchmark test anyway.

uniques = _reconstruct_data(uniques, dtype, original)

# return original tensor
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6584,7 +6584,8 @@ def clip_lower(self, threshold, axis=None, inplace=False):
axis=axis, inplace=inplace)

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=False, observed=False, **kwargs):
group_keys=True, squeeze=False, observed=False,
group_nas=False, **kwargs):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns.
Expand Down Expand Up @@ -6621,6 +6622,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
This only applies if any of the groupers are Categoricals
If True: only show observed values for categorical groupers.
If False: show all values for categorical groupers.
group_nas : boolean, default False
    If True, treat NaN values as regular group labels instead of
    excluding them from the groups

.. versionadded:: 0.23.0

Expand Down Expand Up @@ -6656,7 +6659,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
axis = self._get_axis_number(axis)
return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys, squeeze=squeeze,
observed=observed, **kwargs)
observed=observed, group_nas=group_nas, **kwargs)

def asfreq(self, freq, method=None, how=None, normalize=False,
fill_value=None):
Expand Down
19 changes: 14 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ class _GroupBy(PandasObject, SelectionMixin):
def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True, squeeze=False,
observed=False, **kwargs):
observed=False, group_nas=False, **kwargs):

self._selection = selection

Expand All @@ -588,6 +588,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.group_keys = group_keys
self.squeeze = squeeze
self.observed = observed
self.group_nas = group_nas
self.mutated = kwargs.pop('mutated', False)

if grouper is None:
Expand All @@ -596,6 +597,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
level=level,
sort=sort,
observed=observed,
group_nas=group_nas,
mutated=self.mutated)

self.obj = obj
Expand Down Expand Up @@ -2922,6 +2924,8 @@ class Grouping(object):
level :
observed : boolean, default False
If we are a Categorical, use the observed values
group_nas : boolean, default False
Should NaNs be grouped as another value
in_axis : if the Grouping is a column in self.obj and hence among
Groupby.exclusions list

Expand All @@ -2937,7 +2941,7 @@ class Grouping(object):
"""

def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True, observed=False, in_axis=False):
sort=True, observed=False, group_nas=False, in_axis=False):

self.name = name
self.level = level
Expand All @@ -2947,6 +2951,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.sort = sort
self.obj = obj
self.observed = observed
self.group_nas = group_nas
self.in_axis = in_axis

# right place for this?
Expand Down Expand Up @@ -3100,7 +3105,7 @@ def _make_labels(self):
uniques = self.grouper.result_index
else:
labels, uniques = algorithms.factorize(
self.grouper, sort=self.sort)
self.grouper, sort=self.sort, factor_nas=self.group_nas)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._group_index = uniques
Expand All @@ -3112,7 +3117,8 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
observed=False, mutated=False, validate=True):
observed=False, group_nas=False, mutated=False,
validate=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
Expand Down Expand Up @@ -3311,6 +3317,7 @@ def is_in_obj(gpr):
level=level,
sort=sort,
observed=observed,
group_nas=group_nas,
in_axis=in_axis) \
if not isinstance(gpr, Grouping) else gpr

Expand Down Expand Up @@ -3752,7 +3759,9 @@ def nunique(self, dropna=True):
except TypeError: # catches object dtypes
assert val.dtype == object, \
'val.dtype must be object, got %s' % val.dtype
val, _ = algorithms.factorize(val, sort=False)
val, _ = algorithms.factorize(
val, sort=False, factor_nas=self.group_nas
)
sorter = np.lexsort((val, ids))
_isna = lambda a: a == -1
else:
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,12 @@ def test_with_na_groups(dtype):

assert_series_equal(agged, expected, check_dtype=False)

grouped_na = values.groupby(labels, group_nas=True)
agged_na = grouped_na.agg(len)
expected_na = Series([4, 2, 4], index=['bar', 'foo', np.nan])

assert_series_equal(agged_na, expected_na, check_dtype=False)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reference issue number in a comment.

# assert issubclass(agged.dtype.type, np.integer)

# explicitly return a float from my function
Expand All @@ -288,6 +294,28 @@ def f(x):
assert_series_equal(agged, expected, check_dtype=False)
assert issubclass(agged.dtype.type, np.dtype(dtype).type)

agged_na = grouped_na.agg(f)
expected_na = Series([4, 2, 4], index=['bar', 'foo', np.nan])

assert_series_equal(agged_na, expected_na, check_dtype=False)
assert issubclass(agged_na.dtype.type, np.dtype(dtype).type)

# Check the data frame groupby interface also handles NaNs correctly
df = pd.DataFrame({"Vals": values, "Labs": labels})

agged = df.groupby("Labs")["Vals"].sum()
expected = Series(
[4, 2], index=Index(['bar', 'foo'], name='Labs'), name='Vals'
)
assert_series_equal(agged, expected, check_dtype=False)

agged_na = df.groupby("Labs", group_nas=True)["Vals"].sum()
expected_na = Series(
[4, 2, 4], index=Index(['bar', 'foo', np.nan], name='Labs'),
name='Vals'
)
assert_series_equal(agged_na, expected_na, check_dtype=False)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reference issue number in a comment.


def test_indices_concatenation_order():

Expand Down