Skip to content

Commit 69c5cf3

Browse files
committed
BUG: groupby with categorical and other columns
closes pandas-dev#14942
1 parent 2431641 commit 69c5cf3

File tree

3 files changed

+187
-185
lines changed

3 files changed

+187
-185
lines changed

doc/source/whatsnew/v0.23.0.txt

+51
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,57 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
482482
'Taxes': -200,
483483
'Net result': 300}).sort_index()
484484

485+
.. _whatsnew_0230.api_breaking.categorical_grouping:
486+
487+
Categorical Grouping no longer expands to all possible groupers
488+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
489+
490+
In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for each grouper, not just the observed values. This is inconsistent with output for other dtypes, can potentially cast to different dtypes (as missing values are introduced), and could cause a huge frame to be generated. Pandas will now return only the observed values, regardless of whether you are grouping on a categorical column; note that the categorical dtype is *still* preserved — you will still have categorical columns (:issue:`14942`)
491+
492+
493+
.. ipython:: python
494+
495+
cat1 = pd.Categorical(["a", "a", "b", "b"],
496+
categories=["a", "b", "z"], ordered=True)
497+
cat2 = pd.Categorical(["c", "d", "c", "d"],
498+
categories=["c", "d", "y"], ordered=True)
499+
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
500+
df['C'] = ['foo', 'bar'] * 2
501+
df
502+
503+
Previous Behavior:
504+
505+
.. code-block:: python
506+
507+
In [4]: df.groupby(['A', 'B', 'C']).count()
508+
Out[4]:
509+
values
510+
A B C
511+
a c bar NaN
512+
foo 1.0
513+
d bar 1.0
514+
foo NaN
515+
y bar NaN
516+
foo NaN
517+
b c bar NaN
518+
foo 1.0
519+
d bar 1.0
520+
foo NaN
521+
y bar NaN
522+
foo NaN
523+
z c bar NaN
524+
foo NaN
525+
d bar NaN
526+
foo NaN
527+
y bar NaN
528+
foo NaN
529+
530+
New Behavior:
531+
532+
.. ipython:: python
533+
534+
df.groupby(['A', 'B', 'C']).count()
535+
485536
.. _whatsnew_0230.api_breaking.deprecate_panel:
486537

487538
Deprecate Panel

pandas/core/groupby/groupby.py

+14-63
Original file line numberDiff line numberDiff line change
@@ -2336,10 +2336,13 @@ def result_index(self):
23362336
if not self.compressed and len(self.groupings) == 1:
23372337
return self.groupings[0].group_index.rename(self.names[0])
23382338

2339-
return MultiIndex(levels=[ping.group_index for ping in self.groupings],
2340-
labels=self.recons_labels,
2341-
verify_integrity=False,
2342-
names=self.names)
2339+
labels = self.recons_labels
2340+
levels = [ping.group_index for ping in self.groupings]
2341+
result = MultiIndex(levels=levels,
2342+
labels=labels,
2343+
verify_integrity=False,
2344+
names=self.names)
2345+
return result.remove_unused_levels()
23432346

23442347
def get_group_levels(self):
23452348
if not self.compressed and len(self.groupings) == 1:
@@ -4151,7 +4154,7 @@ def first_not_none(values):
41514154
not_indexed_same=not_indexed_same)
41524155
elif self.grouper.groupings is not None:
41534156
if len(self.grouper.groupings) > 1:
4154-
key_index = MultiIndex.from_tuples(keys, names=key_names)
4157+
key_index = self.grouper.result_index
41554158

41564159
else:
41574160
ping = self.grouper.groupings[0]
@@ -4241,8 +4244,9 @@ def first_not_none(values):
42414244

42424245
# normally use vstack as its faster than concat
42434246
# and if we have mi-columns
4244-
if isinstance(v.index,
4245-
MultiIndex) or key_index is None:
4247+
if (isinstance(v.index, MultiIndex) or
4248+
key_index is None or
4249+
isinstance(key_index, MultiIndex)):
42464250
stacked_values = np.vstack(map(np.asarray, values))
42474251
result = DataFrame(stacked_values, index=key_index,
42484252
columns=index)
@@ -4280,7 +4284,7 @@ def first_not_none(values):
42804284
else:
42814285
result = result._convert(datetime=True)
42824286

4283-
return self._reindex_output(result)
4287+
return result
42844288

42854289
# values are not series or array-like but scalars
42864290
else:
@@ -4661,7 +4665,7 @@ def _wrap_aggregated_output(self, output, names=None):
46614665
if self.axis == 1:
46624666
result = result.T
46634667

4664-
return self._reindex_output(result)._convert(datetime=True)
4668+
return result._convert(datetime=True)
46654669

46664670
def _wrap_transformed_output(self, output, names=None):
46674671
return DataFrame(output, index=self.obj.index)
@@ -4682,60 +4686,7 @@ def _wrap_agged_blocks(self, items, blocks):
46824686
if self.axis == 1:
46834687
result = result.T
46844688

4685-
return self._reindex_output(result)._convert(datetime=True)
4686-
4687-
def _reindex_output(self, result):
4688-
"""
4689-
if we have categorical groupers, then we want to make sure that
4690-
we have a fully reindex-output to the levels. These may have not
4691-
participated in the groupings (e.g. may have all been
4692-
nan groups)
4693-
4694-
This can re-expand the output space
4695-
"""
4696-
groupings = self.grouper.groupings
4697-
if groupings is None:
4698-
return result
4699-
elif len(groupings) == 1:
4700-
return result
4701-
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
4702-
for ping in groupings):
4703-
return result
4704-
4705-
levels_list = [ping.group_index for ping in groupings]
4706-
index, _ = MultiIndex.from_product(
4707-
levels_list, names=self.grouper.names).sortlevel()
4708-
4709-
if self.as_index:
4710-
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
4711-
return result.reindex(**d)
4712-
4713-
# GH 13204
4714-
# Here, the categorical in-axis groupers, which need to be fully
4715-
# expanded, are columns in `result`. An idea is to do:
4716-
# result = result.set_index(self.grouper.names)
4717-
# .reindex(index).reset_index()
4718-
# but special care has to be taken because of possible not-in-axis
4719-
# groupers.
4720-
# So, we manually select and drop the in-axis grouper columns,
4721-
# reindex `result`, and then reset the in-axis grouper columns.
4722-
4723-
# Select in-axis groupers
4724-
in_axis_grps = [(i, ping.name) for (i, ping)
4725-
in enumerate(groupings) if ping.in_axis]
4726-
g_nums, g_names = zip(*in_axis_grps)
4727-
4728-
result = result.drop(labels=list(g_names), axis=1)
4729-
4730-
# Set a temp index and reindex (possibly expanding)
4731-
result = result.set_index(self.grouper.result_index
4732-
).reindex(index, copy=False)
4733-
4734-
# Reset in-axis grouper columns
4735-
# (using level numbers `g_nums` because level names may not be unique)
4736-
result = result.reset_index(level=g_nums)
4737-
4738-
return result.reset_index(drop=True)
4689+
return result._convert(datetime=True)
47394690

47404691
def _iterate_column_groupbys(self):
47414692
for i, colname in enumerate(self._selected_obj.columns):

0 commit comments

Comments
 (0)