diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index df7a5dc9dc173..6cd2a91e9c17d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2298,7 +2298,40 @@ def levels(self):
 
     @property
     def names(self):
-        return [ping.name for ping in self.groupings]
+        """
+        Names of the groupings, made unique for use as index level names.
+
+        If no grouping is named, the (all-None) names are returned as they
+        are. Otherwise every name occurring more than once gets an integer
+        suffix in order of appearance ('x' -> 'x_0', 'x_1', ...; None ->
+        '0', '1', ...); suffixes that would clash with a name already in
+        use are skipped (GH 19029).
+        """
+        orig_names = [ping.name for ping in self.groupings]
+        # if no names were assigned return the original names
+        if all(x is None for x in orig_names):
+            return orig_names
+
+        counts = collections.Counter(orig_names)
+        # names already in use; a generated name must not repeat any of them
+        taken = set(orig_names)
+        suffixes = collections.defaultdict(int)
+        new_names = []
+        for name in orig_names:
+            if counts[name] == 1:
+                new_names.append(name)
+                continue
+
+            prefix = '' if name is None else '{0}_'.format(name)
+            while True:
+                new_name = '{0}{1}'.format(prefix, suffixes[name])
+                suffixes[name] += 1
+                if new_name not in taken:
+                    break
+            taken.add(new_name)
+            new_names.append(new_name)
+
+        return new_names
 
     def size(self):
         """
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index e0793b8e1bd64..fc3f2b1b7c4b7 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -558,10 +558,6 @@ def test_as_index():
     result = df.groupby(['cat', s], as_index=False, observed=True).sum()
     tm.assert_frame_equal(result, expected)
 
-    # GH18872: conflicting names in desired index
-    with pytest.raises(ValueError):
-        df.groupby(['cat', s.rename('cat')], observed=True).sum()
-
     # is original index dropped?
     group_columns = ['cat', 'A']
     expected = DataFrame(
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e05f9de5ea7f4..a583c1230bfa4 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1674,3 +1674,53 @@ def test_tuple_correct_keyerror():
                                                           [3, 4]]))
     with tm.assert_raises_regex(KeyError, "(7, 8)"):
         df.groupby((7, 8)).mean()
+
+
+def test_dup_index_names():
+    # dup. index names in groupby operations should be renamed (GH 19029):
+    df = pd.DataFrame({'date': pd.date_range('5.1.2018', '5.3.2018'),
+                       'vals': list(range(3))})
+
+    # duplicates get suffixed by integer position
+    mi = pd.MultiIndex.from_product([[5], [1, 2, 3]],
+                                    names=['date_0', 'date_1'])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
+
+    tm.assert_series_equal(result, expected)
+
+    # 2 out of 3 are duplicates and None
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=['0', '1', 'date'])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year.rename(None),
+                         df.date.dt.month.rename(None),
+                         df.date.dt.day])['vals'].sum()
+    tm.assert_series_equal(result, expected)
+
+    # 2 out of 3 names (not None) are duplicates, the remaining is None
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=['date_0', None, 'date_1'])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year,
+                         df.date.dt.month.rename(None),
+                         df.date.dt.day])['vals'].sum()
+    tm.assert_series_equal(result, expected)
+
+    # all are None
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=[None, None, None])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year.rename(None),
+                         df.date.dt.month.rename(None),
+                         df.date.dt.day.rename(None)])['vals'].sum()
+    tm.assert_series_equal(result, expected)
+
+    # a generated name must not clash with an existing level name
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=['date_1', 'date_2', 'date_0'])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year,
+                         df.date.dt.month,
+                         df.date.dt.day.rename('date_0')])['vals'].sum()
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index d2cf3fc11e165..3e416e6fed161 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1705,9 +1705,16 @@ def test_crosstab_with_numpy_size(self):
         tm.assert_frame_equal(result, expected)
 
     def test_crosstab_dup_index_names(self):
-        # GH 13279, GH 18872
+        # duplicated index name should get renamed (GH 19029)
         s = pd.Series(range(3), name='foo')
-        pytest.raises(ValueError, pd.crosstab, s, s)
+        # must not raise: a ValueError here fails the test on its own
+        result = pd.crosstab(s, s)
+
+        s0 = pd.Series(range(3), name='foo0')
+        s1 = pd.Series(range(3), name='foo1')
+        expected = pd.DataFrame(np.diag(np.ones(3, dtype='int64')),
+                                index=s0, columns=s1)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("names", [['a', ('b', 'c')],
                                        [('a', 'b'), 'c']])