Skip to content

Commit 48f57f8

Browse files
gfyoung authored and victor committed
DEPR: Error with ambiguous groupby strings (pandas-dev#22415)
xref gh-14432.
1 parent f4d8052 commit 48f57f8

File tree

12 files changed

+51
-270
lines changed

12 files changed

+51
-270
lines changed

doc/source/groupby.rst

+2-3
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,8 @@ consider the following ``DataFrame``:
106106
.. versionadded:: 0.20
107107

108108
A string passed to ``groupby`` may refer to either a column or an index level.
109-
If a string matches both a column name and an index level name then a warning is
110-
issued and the column takes precedence. This will result in an ambiguity error
111-
in a future version.
109+
If a string matches both a column name and an index level name, a
110+
``ValueError`` will be raised.
112111

113112
.. ipython:: python
114113

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ Removal of prior version deprecations/changes
523523
- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`)
524524
- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`)
525525
- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`)
526+
- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`)
526527
-
527528

528529
.. _whatsnew_0240.performance:

pandas/core/frame.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -4393,7 +4393,6 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
43934393
kind='quicksort', na_position='last'):
43944394
inplace = validate_bool_kwarg(inplace, 'inplace')
43954395
axis = self._get_axis_number(axis)
4396-
stacklevel = 2 # Number of stack levels from df.sort_values
43974396

43984397
if not isinstance(by, list):
43994398
by = [by]
@@ -4405,8 +4404,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
44054404

44064405
keys = []
44074406
for x in by:
4408-
k = self._get_label_or_level_values(x, axis=axis,
4409-
stacklevel=stacklevel)
4407+
k = self._get_label_or_level_values(x, axis=axis)
44104408
keys.append(k)
44114409
indexer = lexsort_indexer(keys, orders=ascending,
44124410
na_position=na_position)
@@ -4415,8 +4413,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
44154413
from pandas.core.sorting import nargsort
44164414

44174415
by = by[0]
4418-
k = self._get_label_or_level_values(by, axis=axis,
4419-
stacklevel=stacklevel)
4416+
k = self._get_label_or_level_values(by, axis=axis)
44204417

44214418
if isinstance(ascending, (tuple, list)):
44224419
ascending = ascending[0]

pandas/core/generic.py

+9-28
Original file line numberDiff line numberDiff line change
@@ -1412,33 +1412,23 @@ def _is_label_or_level_reference(self, key, axis=0):
14121412
return (self._is_level_reference(key, axis=axis) or
14131413
self._is_label_reference(key, axis=axis))
14141414

1415-
def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1):
1415+
def _check_label_or_level_ambiguity(self, key, axis=0):
14161416
"""
1417-
Check whether `key` matches both a level of the input `axis` and a
1418-
label of the other axis and raise a ``FutureWarning`` if this is the
1419-
case.
1417+
Check whether `key` is ambiguous.
14201418
1421-
Note: This method will be altered to raise an ambiguity exception in
1422-
a future version.
1419+
By ambiguous, we mean that it matches both a level of the input
1420+
`axis` and a label of the other axis.
14231421
14241422
Parameters
14251423
----------
14261424
key: str or object
14271425
label or level name
14281426
axis: int, default 0
14291427
Axis that levels are associated with (0 for index, 1 for columns)
1430-
stacklevel: int, default 1
1431-
Stack level used when a FutureWarning is raised (see below).
1432-
1433-
Returns
1434-
-------
1435-
ambiguous: bool
14361428
14371429
Raises
14381430
------
1439-
FutureWarning
1440-
if `key` is ambiguous. This will become an ambiguity error in a
1441-
future version
1431+
ValueError: `key` is ambiguous
14421432
"""
14431433

14441434
axis = self._get_axis_number(axis)
@@ -1464,21 +1454,15 @@ def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1):
14641454
('an', 'index'))
14651455

14661456
msg = ("'{key}' is both {level_article} {level_type} level and "
1467-
"{label_article} {label_type} label.\n"
1468-
"Defaulting to {label_type}, but this will raise an "
1469-
"ambiguity error in a future version"
1457+
"{label_article} {label_type} label, which is ambiguous."
14701458
).format(key=key,
14711459
level_article=level_article,
14721460
level_type=level_type,
14731461
label_article=label_article,
14741462
label_type=label_type)
1463+
raise ValueError(msg)
14751464

1476-
warnings.warn(msg, FutureWarning, stacklevel=stacklevel + 1)
1477-
return True
1478-
else:
1479-
return False
1480-
1481-
def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
1465+
def _get_label_or_level_values(self, key, axis=0):
14821466
"""
14831467
Return a 1-D array of values associated with `key`, a label or level
14841468
from the given `axis`.
@@ -1497,8 +1481,6 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
14971481
Label or level name.
14981482
axis: int, default 0
14991483
Axis that levels are associated with (0 for index, 1 for columns)
1500-
stacklevel: int, default 1
1501-
Stack level used when a FutureWarning is raised (see below).
15021484
15031485
Returns
15041486
-------
@@ -1524,8 +1506,7 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
15241506
.format(type=type(self)))
15251507

15261508
if self._is_label_reference(key, axis=axis):
1527-
self._check_label_or_level_ambiguity(key, axis=axis,
1528-
stacklevel=stacklevel + 1)
1509+
self._check_label_or_level_ambiguity(key, axis=axis)
15291510
values = self.xs(key, axis=other_axes[0])._values
15301511
elif self._is_level_reference(key, axis=axis):
15311512
values = self.axes[axis].get_level_values(key)._values

pandas/core/groupby/grouper.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -571,9 +571,7 @@ def is_in_obj(gpr):
571571
elif is_in_axis(gpr): # df.groupby('name')
572572
if gpr in obj:
573573
if validate:
574-
stacklevel = 5 # Number of stack levels from df.groupby
575-
obj._check_label_or_level_ambiguity(
576-
gpr, stacklevel=stacklevel)
574+
obj._check_label_or_level_ambiguity(gpr)
577575
in_axis, name, gpr = True, gpr, obj[gpr]
578576
exclusions.append(name)
579577
elif obj._is_level_reference(gpr):

pandas/core/reshape/merge.py

+5-11
Original file line numberDiff line numberDiff line change
@@ -811,7 +811,6 @@ def _get_merge_keys(self):
811811
left_drop = []
812812

813813
left, right = self.left, self.right
814-
stacklevel = 5 # Number of stack levels from df.merge
815814

816815
is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
817816
is_rkey = lambda x: is_array_like(x) and len(x) == len(right)
@@ -837,8 +836,7 @@ def _get_merge_keys(self):
837836
else:
838837
if rk is not None:
839838
right_keys.append(
840-
right._get_label_or_level_values(
841-
rk, stacklevel=stacklevel))
839+
right._get_label_or_level_values(rk))
842840
join_names.append(rk)
843841
else:
844842
# work-around for merge_asof(right_index=True)
@@ -848,8 +846,7 @@ def _get_merge_keys(self):
848846
if not is_rkey(rk):
849847
if rk is not None:
850848
right_keys.append(
851-
right._get_label_or_level_values(
852-
rk, stacklevel=stacklevel))
849+
right._get_label_or_level_values(rk))
853850
else:
854851
# work-around for merge_asof(right_index=True)
855852
right_keys.append(right.index)
@@ -862,8 +859,7 @@ def _get_merge_keys(self):
862859
else:
863860
right_keys.append(rk)
864861
if lk is not None:
865-
left_keys.append(left._get_label_or_level_values(
866-
lk, stacklevel=stacklevel))
862+
left_keys.append(left._get_label_or_level_values(lk))
867863
join_names.append(lk)
868864
else:
869865
# work-around for merge_asof(left_index=True)
@@ -875,8 +871,7 @@ def _get_merge_keys(self):
875871
left_keys.append(k)
876872
join_names.append(None)
877873
else:
878-
left_keys.append(left._get_label_or_level_values(
879-
k, stacklevel=stacklevel))
874+
left_keys.append(left._get_label_or_level_values(k))
880875
join_names.append(k)
881876
if isinstance(self.right.index, MultiIndex):
882877
right_keys = [lev._values.take(lab)
@@ -890,8 +885,7 @@ def _get_merge_keys(self):
890885
right_keys.append(k)
891886
join_names.append(None)
892887
else:
893-
right_keys.append(right._get_label_or_level_values(
894-
k, stacklevel=stacklevel))
888+
right_keys.append(right._get_label_or_level_values(k))
895889
join_names.append(k)
896890
if isinstance(self.left.index, MultiIndex):
897891
left_keys = [lev._values.take(lab)

pandas/tests/frame/test_sort_values_level_as_str.py

+1-32
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
import pytest
33

4-
from pandas import DataFrame, Index
4+
from pandas import DataFrame
55
from pandas.errors import PerformanceWarning
66
from pandas.util import testing as tm
77
from pandas.util.testing import assert_frame_equal
@@ -93,34 +93,3 @@ def test_sort_column_level_and_index_label(
9393
assert_frame_equal(result, expected)
9494
else:
9595
assert_frame_equal(result, expected)
96-
97-
98-
def test_sort_values_column_index_level_precedence():
99-
# GH 14353, when a string passed as the `by` parameter
100-
# matches a column and an index level the column takes
101-
# precedence
102-
103-
# Construct DataFrame with index and column named 'idx'
104-
idx = Index(np.arange(1, 7), name='idx')
105-
df = DataFrame({'A': np.arange(11, 17),
106-
'idx': np.arange(6, 0, -1)},
107-
index=idx)
108-
109-
# Sorting by 'idx' should sort by the idx column and raise a
110-
# FutureWarning
111-
with tm.assert_produces_warning(FutureWarning):
112-
result = df.sort_values(by='idx')
113-
114-
# This should be equivalent to sorting by the 'idx' index level in
115-
# descending order
116-
expected = df.sort_index(level='idx', ascending=False)
117-
assert_frame_equal(result, expected)
118-
119-
# Perform same test with MultiIndex
120-
df_multi = df.set_index('A', append=True)
121-
122-
with tm.assert_produces_warning(FutureWarning):
123-
result = df_multi.sort_values(by='idx')
124-
125-
expected = df_multi.sort_index(level='idx', ascending=False)
126-
assert_frame_equal(result, expected)

pandas/tests/generic/test_label_or_level_utils.py

+28-47
Original file line numberDiff line numberDiff line change
@@ -166,31 +166,24 @@ def test_is_label_or_level_reference_panel_error(panel):
166166
def test_check_label_or_level_ambiguity_df(df_ambig, axis):
167167

168168
# Transpose frame if axis == 1
169-
if axis in {1, 'columns'}:
169+
if axis in {1, "columns"}:
170170
df_ambig = df_ambig.T
171171

172-
# df_ambig has both an on-axis level and off-axis label named L1
173-
# Therefore L1 is ambiguous
174-
with tm.assert_produces_warning(FutureWarning,
175-
clear=True) as w:
172+
if axis in {0, "index"}:
173+
msg = "'L1' is both an index level and a column label"
174+
else:
175+
msg = "'L1' is both a column level and an index label"
176176

177-
assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis)
178-
warning_msg = w[0].message.args[0]
179-
if axis in {0, 'index'}:
180-
assert warning_msg.startswith("'L1' is both an index level "
181-
"and a column label")
182-
else:
183-
assert warning_msg.startswith("'L1' is both a column level "
184-
"and an index label")
177+
# df_ambig has both an on-axis level and off-axis label named L1
178+
# Therefore, L1 is ambiguous.
179+
with tm.assert_raises_regex(ValueError, msg):
180+
df_ambig._check_label_or_level_ambiguity("L1", axis=axis)
185181

186-
# df_ambig has an on-axis level named L2 and it is not ambiguous
187-
# No warning should be raised
188-
with tm.assert_produces_warning(None):
189-
assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis)
182+
# df_ambig has an on-axis level named L2, and it is not ambiguous.
183+
df_ambig._check_label_or_level_ambiguity("L2", axis=axis)
190184

191-
# df_ambig has an off-axis label named L3 and it is not ambiguous
192-
with tm.assert_produces_warning(None):
193-
assert not df_ambig._is_level_reference('L3', axis=axis)
185+
# df_ambig has an off-axis label named L3, and it is not ambiguous
186+
assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis)
194187

195188

196189
# Series
@@ -200,17 +193,15 @@ def test_check_label_or_level_ambiguity_series(df):
200193
# A series has no columns and therefore references are never ambiguous
201194

202195
# Make series with L1 as index
203-
s = df.set_index('L1').L2
204-
with tm.assert_produces_warning(None):
205-
assert not s._check_label_or_level_ambiguity('L1', axis=0)
206-
assert not s._check_label_or_level_ambiguity('L2', axis=0)
196+
s = df.set_index("L1").L2
197+
s._check_label_or_level_ambiguity("L1", axis=0)
198+
s._check_label_or_level_ambiguity("L2", axis=0)
207199

208200
# Make series with L1 and L2 as index
209-
s = df.set_index(['L1', 'L2']).L3
210-
with tm.assert_produces_warning(None):
211-
assert not s._check_label_or_level_ambiguity('L1', axis=0)
212-
assert not s._check_label_or_level_ambiguity('L2', axis=0)
213-
assert not s._check_label_or_level_ambiguity('L3', axis=0)
201+
s = df.set_index(["L1", "L2"]).L3
202+
s._check_label_or_level_ambiguity("L1", axis=0)
203+
s._check_label_or_level_ambiguity("L2", axis=0)
204+
s._check_label_or_level_ambiguity("L3", axis=0)
214205

215206

216207
def test_check_label_or_level_ambiguity_series_axis1_error(df):
@@ -229,7 +220,7 @@ def test_check_label_or_level_ambiguity_panel_error(panel):
229220
.format(type=type(panel)))
230221

231222
with tm.assert_raises_regex(NotImplementedError, msg):
232-
panel._check_label_or_level_ambiguity('L1', axis=0)
223+
panel._check_label_or_level_ambiguity("L1", axis=0)
233224

234225

235226
# Test _get_label_or_level_values
@@ -241,19 +232,16 @@ def assert_label_values(frame, labels, axis):
241232
else:
242233
expected = frame.loc[label]._values
243234

244-
result = frame._get_label_or_level_values(label, axis=axis,
245-
stacklevel=2)
235+
result = frame._get_label_or_level_values(label, axis=axis)
246236
assert array_equivalent(expected, result)
247237

248238

249239
def assert_level_values(frame, levels, axis):
250240
for level in levels:
251-
if axis in {0, 'index'}:
241+
if axis in {0, "index"}:
252242
expected = frame.index.get_level_values(level=level)._values
253243
else:
254-
expected = (frame.columns
255-
.get_level_values(level=level)
256-
._values)
244+
expected = frame.columns.get_level_values(level=level)._values
257245

258246
result = frame._get_label_or_level_values(level, axis=axis)
259247
assert array_equivalent(expected, result)
@@ -281,18 +269,11 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis):
281269
if axis in {1, 'columns'}:
282270
df_ambig = df_ambig.T
283271

284-
# df has both an on-axis level and off-axis label named L1
285-
# Therefore L1 is ambiguous but will default to label
286-
with tm.assert_produces_warning(FutureWarning):
287-
assert_label_values(df_ambig, ['L1'], axis=axis)
288-
289-
# df has an on-axis level named L2 and it is not ambiguous
290-
with tm.assert_produces_warning(None):
291-
assert_level_values(df_ambig, ['L2'], axis=axis)
272+
# df has an on-axis level named L2, and it is not ambiguous.
273+
assert_level_values(df_ambig, ['L2'], axis=axis)
292274

293-
# df has an off-axis label named L3 and it is not ambiguous
294-
with tm.assert_produces_warning(None):
295-
assert_label_values(df_ambig, ['L3'], axis=axis)
275+
# df has an off-axis label named L3, and it is not ambiguous.
276+
assert_label_values(df_ambig, ['L3'], axis=axis)
296277

297278

298279
def test_get_label_or_level_values_df_duplabels(df_duplabels, axis):

pandas/tests/groupby/test_categorical.py

+2-11
Original file line numberDiff line numberDiff line change
@@ -568,18 +568,9 @@ def test_as_index():
568568
'B': [101, 205]},
569569
columns=['cat', 'A', 'B'])
570570

571-
for name in [None, 'X', 'B', 'cat']:
571+
for name in [None, 'X', 'B']:
572572
df.index = Index(list("abc"), name=name)
573-
574-
if name in group_columns and name in df.index.names:
575-
with tm.assert_produces_warning(FutureWarning,
576-
check_stacklevel=False):
577-
result = df.groupby(
578-
group_columns, as_index=False, observed=True).sum()
579-
580-
else:
581-
result = df.groupby(
582-
group_columns, as_index=False, observed=True).sum()
573+
result = df.groupby(group_columns, as_index=False, observed=True).sum()
583574

584575
tm.assert_frame_equal(result, expected)
585576

0 commit comments

Comments
 (0)