Commit 5220a20

BUG: groupby with categorical and other columns
closes pandas-dev#14942
1 parent 5edc5c4 commit 5220a20

3 files changed: +187, -185 lines

doc/source/whatsnew/v0.23.0.txt

+51
@@ -479,6 +479,57 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
              'Taxes': -200,
              'Net result': 300}).sort_index()
 
+.. _whatsnew_0230.api_breaking.categorical_grouping:
+
+Categorical Grouping no longer expands to all possible groupers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, grouping by one or more categorical columns would produce an index that was the cartesian product of all of the categories for each grouper, not just the observed values. This was inconsistent with the output for other dtypes, could cast to different dtypes (as missing values were introduced), and could generate a very large frame. Pandas now returns only the observed values, even when grouping on a categorical column; the categorical dtype is *still* preserved, so you will still have categorical columns in the result (:issue:`14942`)
+
+
+.. ipython:: python
+
+   cat1 = pd.Categorical(["a", "a", "b", "b"],
+                         categories=["a", "b", "z"], ordered=True)
+   cat2 = pd.Categorical(["c", "d", "c", "d"],
+                         categories=["c", "d", "y"], ordered=True)
+   df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+   df['C'] = ['foo', 'bar'] * 2
+   df
+
+Previous Behavior:
+
+.. code-block:: python
+
+   In [4]: df.groupby(['A', 'B', 'C']).count()
+   Out[4]:
+              values
+   A B C
+   a c bar       NaN
+       foo       1.0
+     d bar       1.0
+       foo       NaN
+     y bar       NaN
+       foo       NaN
+   b c bar       NaN
+       foo       1.0
+     d bar       1.0
+       foo       NaN
+     y bar       NaN
+       foo       NaN
+   z c bar       NaN
+       foo       NaN
+     d bar       NaN
+       foo       NaN
+     y bar       NaN
+       foo       NaN
+
+New Behavior:
+
+.. ipython:: python
+
+   df.groupby(['A', 'B', 'C']).count()
+
 .. _whatsnew_0230.api_breaking.deprecate_panel:
 
 Deprecate Panel

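For code that relied on the old expanded output, the sketch below shows one way to get it back: reindex the new, observed-only result against the full cartesian product of the grouper categories, which is essentially what the removed `_reindex_output` helper (see the groupby.py diff below) did internally. This is not part of the commit; the `full_index` and `expanded` names are illustrative, the observed values of the plain column `C` are hard-coded, and exact reindexing behavior with categorical index levels may vary across pandas versions.

import pandas as pd

# Same frame as the whatsnew example above.
cat1 = pd.Categorical(["a", "a", "b", "b"],
                      categories=["a", "b", "z"], ordered=True)
cat2 = pd.Categorical(["c", "d", "c", "d"],
                      categories=["c", "d", "y"], ordered=True)
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
df["C"] = ["foo", "bar"] * 2

# New behavior: only observed (A, B, C) combinations appear in the index.
result = df.groupby(["A", "B", "C"]).count()

# Reindex against the full product of the categorical levels plus the observed
# values of "C"; unobserved combinations come back as NaN rows, as they did
# under the previous behavior.
full_index = pd.MultiIndex.from_product(
    [cat1.categories, cat2.categories, ["bar", "foo"]],
    names=["A", "B", "C"])
expanded = result.reindex(full_index)
print(expanded)
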
pandas/core/groupby/groupby.py

+14, -63
@@ -2336,10 +2336,13 @@ def result_index(self):
         if not self.compressed and len(self.groupings) == 1:
             return self.groupings[0].group_index.rename(self.names[0])
 
-        return MultiIndex(levels=[ping.group_index for ping in self.groupings],
-                          labels=self.recons_labels,
-                          verify_integrity=False,
-                          names=self.names)
+        labels = self.recons_labels
+        levels = [ping.group_index for ping in self.groupings]
+        result = MultiIndex(levels=levels,
+                            labels=labels,
+                            verify_integrity=False,
+                            names=self.names)
+        return result.remove_unused_levels()
 
     def get_group_levels(self):
         if not self.compressed and len(self.groupings) == 1:
@@ -4151,7 +4154,7 @@ def first_not_none(values):
                                         not_indexed_same=not_indexed_same)
         elif self.grouper.groupings is not None:
            if len(self.grouper.groupings) > 1:
-                key_index = MultiIndex.from_tuples(keys, names=key_names)
+                key_index = self.grouper.result_index
 
             else:
                 ping = self.grouper.groupings[0]
@@ -4241,8 +4244,9 @@ def first_not_none(values):
 
                 # normally use vstack as its faster than concat
                 # and if we have mi-columns
-                if isinstance(v.index,
-                              MultiIndex) or key_index is None:
+                if (isinstance(v.index, MultiIndex) or
+                        key_index is None or
+                        isinstance(key_index, MultiIndex)):
                     stacked_values = np.vstack(map(np.asarray, values))
                     result = DataFrame(stacked_values, index=key_index,
                                        columns=index)
@@ -4280,7 +4284,7 @@ def first_not_none(values):
                 else:
                     result = result._convert(datetime=True)
 
-                return self._reindex_output(result)
+                return result
 
             # values are not series or array-like but scalars
             else:
@@ -4661,7 +4665,7 @@ def _wrap_aggregated_output(self, output, names=None):
         if self.axis == 1:
             result = result.T
 
-        return self._reindex_output(result)._convert(datetime=True)
+        return result._convert(datetime=True)
 
     def _wrap_transformed_output(self, output, names=None):
         return DataFrame(output, index=self.obj.index)
@@ -4682,60 +4686,7 @@ def _wrap_agged_blocks(self, items, blocks):
         if self.axis == 1:
             result = result.T
 
-        return self._reindex_output(result)._convert(datetime=True)
-
-    def _reindex_output(self, result):
-        """
-        if we have categorical groupers, then we want to make sure that
-        we have a fully reindex-output to the levels. These may have not
-        participated in the groupings (e.g. may have all been
-        nan groups)
-
-        This can re-expand the output space
-        """
-        groupings = self.grouper.groupings
-        if groupings is None:
-            return result
-        elif len(groupings) == 1:
-            return result
-        elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
-                     for ping in groupings):
-            return result
-
-        levels_list = [ping.group_index for ping in groupings]
-        index, _ = MultiIndex.from_product(
-            levels_list, names=self.grouper.names).sortlevel()
-
-        if self.as_index:
-            d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
-            return result.reindex(**d)
-
-        # GH 13204
-        # Here, the categorical in-axis groupers, which need to be fully
-        # expanded, are columns in `result`. An idea is to do:
-        # result = result.set_index(self.grouper.names)
-        #                .reindex(index).reset_index()
-        # but special care has to be taken because of possible not-in-axis
-        # groupers.
-        # So, we manually select and drop the in-axis grouper columns,
-        # reindex `result`, and then reset the in-axis grouper columns.
-
-        # Select in-axis groupers
-        in_axis_grps = [(i, ping.name) for (i, ping)
-                        in enumerate(groupings) if ping.in_axis]
-        g_nums, g_names = zip(*in_axis_grps)
-
-        result = result.drop(labels=list(g_names), axis=1)
-
-        # Set a temp index and reindex (possibly expanding)
-        result = result.set_index(self.grouper.result_index
-                                  ).reindex(index, copy=False)
-
-        # Reset in-axis grouper columns
-        # (using level numbers `g_nums` because level names may not be unique)
-        result = result.reset_index(level=g_nums)
-
-        return result.reset_index(drop=True)
+        return result._convert(datetime=True)
 
     def _iterate_column_groupbys(self):
         for i, colname in enumerate(self._selected_obj.columns):
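The heart of the fix is the `result_index` change in the first hunk: the grouper still builds a MultiIndex whose levels carry every category of each grouper, but `MultiIndex.remove_unused_levels()` then trims the level values that no group label refers to, so the index no longer advertises unobserved categories and the `_reindex_output` re-expansion step (and the `self._reindex_output(...)` calls wrapping the outputs) could be deleted. Below is a standalone sketch of what that method does; it is not taken from the commit, and note that pandas 0.24+ renames the `labels` constructor argument to `codes`.

import pandas as pd

# The levels advertise 'z' and 'y' (like unobserved categories of a grouper),
# but the labels below never refer to them.
mi = pd.MultiIndex(levels=[["a", "b", "z"], ["c", "d", "y"]],
                   labels=[[0, 0, 1, 1], [0, 1, 0, 1]],  # 'codes' in pandas >= 0.24
                   names=["A", "B"])
print(mi.levels)       # [['a', 'b', 'z'], ['c', 'd', 'y']]

# The index keeps the same four entries, but the unused level values are
# dropped, leaving only the observed values of each grouper in the levels.
trimmed = mi.remove_unused_levels()
print(trimmed.levels)  # [['a', 'b'], ['c', 'd']]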
