Skip to content

Commit 7cd448a

Browse files
committed
change groupby-behaviour (duplicates) & tests
Only duplicated names get suffixed with their per-name occurrence index: ['name', None, 'name'] gets transformed into ['name_0', None, 'name_1']. Superfluous test cases have been deleted and some additional test statements have been added.
1 parent c2a3fa5 commit 7cd448a

File tree

3 files changed

+43
-39
lines changed

3 files changed

+43
-39
lines changed

pandas/core/groupby/groupby.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -2298,18 +2298,28 @@ def levels(self):
22982298

22992299
@property
23002300
def names(self):
2301-
# GH 19029
23022301
# add a suffix to level names in case they contain duplicates (GH 19029):
23032302
orig_names = [ping.name for ping in self.groupings]
23042303
# if no names were assigned return the original names
23052304
if all(x is None for x in orig_names):
23062305
return orig_names
2307-
# in case duplicates are contained rename all of them
2308-
if len(set(orig_names)) < len(orig_names):
2309-
orig_names = [''.join([str(x), str(i)])
2310-
for i, x in enumerate(orig_names)]
23112306

2312-
return orig_names
2307+
suffixes = collections.defaultdict(int)
2308+
dups = {n: count for n, count in
2309+
collections.Counter(orig_names).items() if count > 1}
2310+
new_names = []
2311+
for name in orig_names:
2312+
if name not in dups:
2313+
new_names.append(name)
2314+
else:
2315+
if name is not None:
2316+
new_name = '{0}_{1}'.format(name, suffixes[name])
2317+
else:
2318+
new_name = '{0}'.format(suffixes[name])
2319+
suffixes[name] += 1
2320+
new_names.append(new_name)
2321+
2322+
return new_names
23132323

23142324
def size(self):
23152325
"""

pandas/tests/groupby/test_categorical.py

-8
Original file line numberDiff line numberDiff line change
@@ -558,14 +558,6 @@ def test_as_index():
558558
result = df.groupby(['cat', s], as_index=False, observed=True).sum()
559559
tm.assert_frame_equal(result, expected)
560560

561-
# GH 19029: conflicting names should not raise a ValueError anymore
562-
raised = False
563-
try:
564-
df.groupby(['cat', s.rename('cat')], observed=True).sum()
565-
except ValueError:
566-
raised = True
567-
assert raised is False
568-
569561
# is original index dropped?
570562
group_columns = ['cat', 'A']
571563
expected = DataFrame(

pandas/tests/groupby/test_groupby.py

+27-25
Original file line numberDiff line numberDiff line change
@@ -1678,38 +1678,40 @@ def test_tuple_correct_keyerror():
16781678

16791679
def test_dup_index_names():
16801680
# dup. index names in groupby operations should be renamed (GH 19029):
1681-
df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')),
1681+
df = pd.DataFrame({'date': pd.date_range('5.1.2018', '5.3.2018'),
16821682
'vals': list(range(3))})
16831683

1684-
mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1'])
1684+
# duplicates get suffixed by their occurrence index (0, 1, ...)
1685+
mi = pd.MultiIndex.from_product([[5], [1, 2, 3]],
1686+
names=['date_0', 'date_1'])
16851687
expected = pd.Series(data=list(range(3)), index=mi, name='vals')
1686-
1687-
failed = False
1688-
try:
1689-
result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
1690-
except ValueError:
1691-
failed = True
1692-
1693-
assert failed is False
1688+
result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
16941689

16951690
tm.assert_series_equal(result, expected)
16961691

1697-
1698-
def test_empty_index_names():
1699-
# don't rename frames in case no names were assigned (GH 19029)
1700-
df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')),
1701-
'vals': list(range(3))})
1702-
1703-
mi = pd.MultiIndex.from_product([[5], [1, 2, 3]])
1692+
# 2 out of 3 names are None (i.e. duplicated), the remaining one is unique
1693+
mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
1694+
names=['0', '1', 'date'])
17041695
expected = pd.Series(data=list(range(3)), index=mi, name='vals')
1696+
result = df.groupby([df.date.dt.year.rename(None),
1697+
df.date.dt.month.rename(None),
1698+
df.date.dt.day])['vals'].sum()
1699+
tm.assert_series_equal(result, expected)
17051700

1706-
failed = False
1707-
try:
1708-
result = df.groupby([df.date.dt.month.rename(None),
1709-
df.date.dt.day.rename(None)])['vals'].sum()
1710-
except ValueError:
1711-
failed = True
1712-
1713-
assert failed is False
1701+
# 2 out of 3 names (not None) are duplicates, the remaining one is None
1702+
mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
1703+
names=['date_0', None, 'date_1'])
1704+
expected = pd.Series(data=list(range(3)), index=mi, name='vals')
1705+
result = df.groupby([df.date.dt.year,
1706+
df.date.dt.month.rename(None),
1707+
df.date.dt.day])['vals'].sum()
1708+
tm.assert_series_equal(result, expected)
17141709

1710+
# all are None
1711+
mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
1712+
names=[None, None, None])
1713+
expected = pd.Series(data=list(range(3)), index=mi, name='vals')
1714+
result = df.groupby([df.date.dt.year.rename(None),
1715+
df.date.dt.month.rename(None),
1716+
df.date.dt.day.rename(None)])['vals'].sum()
17151717
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)