Skip to content

Commit 77d5f04

Browse files
bthyreau authored and jreback committed
BUG: fix groupby crash on duplicated columns (GH7511)
1 parent a586867 commit 77d5f04

File tree

3 files changed

+23
-3
lines changed

3 files changed

+23
-3
lines changed

doc/source/v0.15.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -684,6 +684,7 @@ Bug Fixes
684684
- Bug in ``groupby`` where callable objects without name attributes would take the wrong path,
685685
and produce a ``DataFrame`` instead of a ``Series`` (:issue:`7929`)
686686

687+
- Bug in ``groupby`` error message when a DataFrame grouping column is duplicated (:issue:`7511`)
687688

688689
- Bug in ``read_html`` where the ``infer_types`` argument forced coercion of
689690
date-likes incorrectly (:issue:`7762`, :issue:`7032`).

pandas/core/groupby.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -1940,6 +1940,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
19401940

19411941
# no level passed
19421942
if not isinstance(self.grouper, (Series, Index, np.ndarray)):
1943+
if getattr(self.grouper,'ndim', 1) != 1:
1944+
raise ValueError("Grouper result with an ndim != 1")
19431945
self.grouper = self.index.map(self.grouper)
19441946
if not (hasattr(self.grouper, "__len__") and
19451947
len(self.grouper) == len(self.index)):
@@ -2098,8 +2100,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
20982100
gpr = obj[gpr]
20992101

21002102
if isinstance(gpr, Categorical) and len(gpr) != len(obj):
2101-
errmsg = "Categorical grouper must have len(grouper) == len(data)"
2102-
raise AssertionError(errmsg)
2103+
raise ValueError("Categorical grouper must have len(grouper) == len(data)")
21032104

21042105
ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
21052106
groupings.append(ping)

pandas/tests/test_groupby.py

+19 -1
Original file line number | Diff line number | Diff line change
@@ -387,6 +387,19 @@ def test_groupby_grouper(self):
387387
expected = grouped.mean()
388388
assert_frame_equal(result, expected)
389389

390+
def test_groupby_duplicated_column_errormsg(self):
391+
# GH7511
392+
df = DataFrame(columns=['A','B','A','C'], \
393+
data=[range(4), range(2,6), range(0, 8, 2)])
394+
395+
self.assertRaises(ValueError, df.groupby, 'A')
396+
self.assertRaises(ValueError, df.groupby, ['A', 'B'])
397+
398+
grouped = df.groupby('B')
399+
c = grouped.count()
400+
self.assertTrue(c.columns.nlevels == 1)
401+
self.assertTrue(c.columns.size == 3)
402+
390403
def test_groupby_dict_mapping(self):
391404
# GH #679
392405
from pandas import Series
@@ -695,6 +708,11 @@ def test_agg_grouping_is_list_tuple(self):
695708
expected = grouped.mean()
696709
tm.assert_frame_equal(result, expected)
697710

711+
def test_grouping_error_on_multidim_input(self):
712+
from pandas.core.groupby import Grouping
713+
self.assertRaises(ValueError, \
714+
Grouping, self.df.index, self.df[['A','A']])
715+
698716
def test_agg_python_multiindex(self):
699717
grouped = self.mframe.groupby(['A', 'B'])
700718

@@ -3298,7 +3316,7 @@ def test_groupby_categorical_unequal_len(self):
32983316
bins = pd.cut(series.dropna(), 4)
32993317

33003318
# len(bins) != len(series) here
3301-
self.assertRaises(AssertionError,lambda : series.groupby(bins).mean())
3319+
self.assertRaises(ValueError,lambda : series.groupby(bins).mean())
33023320

33033321
def test_gb_apply_list_of_unequal_len_arrays(self):
33043322

0 commit comments

Comments
 (0)