Skip to content

Commit 145c227

Browse files
h-vetinari authored and jreback committed
API: better error-handling for df.set_index (pandas-dev#22486)
1 parent c6ca378 commit 145c227

File tree

4 files changed

+101
-50
lines changed

4 files changed

+101
-50
lines changed

doc/source/whatsnew/v0.24.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -805,6 +805,8 @@ Other API Changes
805805
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
806806
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
807807
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
808+
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
809+
has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
808810
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
809811
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
810812

pandas/core/frame.py

+38-17
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@
7373
is_sequence,
7474
is_named_tuple)
7575
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
76+
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
7677
from pandas.core.dtypes.missing import isna, notna
7778

7879
from pandas.core import algorithms
@@ -3988,6 +3989,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39883989
if not isinstance(keys, list):
39893990
keys = [keys]
39903991

3992+
missing = []
3993+
for col in keys:
3994+
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
3995+
# tuples can be either column keys or list-likes
3996+
# if they are valid column keys, everything is fine
3997+
continue
3998+
elif is_scalar(col) and col not in self:
3999+
# tuples that are not column keys are considered list-like,
4000+
# not considered missing
4001+
missing.append(col)
4002+
elif (not is_list_like(col, allow_sets=False)
4003+
or getattr(col, 'ndim', 1) > 1):
4004+
raise TypeError('The parameter "keys" may only contain a '
4005+
'combination of valid column keys and '
4006+
'one-dimensional list-likes')
4007+
4008+
if missing:
4009+
raise KeyError('{}'.format(missing))
4010+
39914011
if inplace:
39924012
frame = self
39934013
else:
@@ -3997,37 +4017,37 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39974017
names = []
39984018
if append:
39994019
names = [x for x in self.index.names]
4000-
if isinstance(self.index, MultiIndex):
4020+
if isinstance(self.index, ABCMultiIndex):
40014021
for i in range(self.index.nlevels):
40024022
arrays.append(self.index._get_level_values(i))
40034023
else:
40044024
arrays.append(self.index)
40054025

40064026
to_remove = []
40074027
for col in keys:
4008-
if isinstance(col, MultiIndex):
4009-
# append all but the last column so we don't have to modify
4010-
# the end of this loop
4011-
for n in range(col.nlevels - 1):
4028+
if isinstance(col, ABCMultiIndex):
4029+
for n in range(col.nlevels):
40124030
arrays.append(col._get_level_values(n))
4013-
4014-
level = col._get_level_values(col.nlevels - 1)
40154031
names.extend(col.names)
4016-
elif isinstance(col, Series):
4017-
level = col._values
4018-
names.append(col.name)
4019-
elif isinstance(col, Index):
4020-
level = col
4032+
elif isinstance(col, (ABCIndexClass, ABCSeries)):
4033+
# if Index then not MultiIndex (treated above)
4034+
arrays.append(col)
40214035
names.append(col.name)
4022-
elif isinstance(col, (list, np.ndarray, Index)):
4023-
level = col
4036+
elif isinstance(col, (list, np.ndarray)):
4037+
arrays.append(col)
4038+
names.append(None)
4039+
elif (is_list_like(col)
4040+
and not (isinstance(col, tuple) and col in self)):
4041+
# all other list-likes (but avoid valid column keys)
4042+
col = list(col) # ensure iterator do not get read twice etc.
4043+
arrays.append(col)
40244044
names.append(None)
4045+
# from here, col can only be a column label
40254046
else:
4026-
level = frame[col]._values
4047+
arrays.append(frame[col]._values)
40274048
names.append(col)
40284049
if drop:
40294050
to_remove.append(col)
4030-
arrays.append(level)
40314051

40324052
index = ensure_index_from_sequences(arrays, names)
40334053

@@ -4036,7 +4056,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
40364056
raise ValueError('Index has duplicate keys: {dup}'.format(
40374057
dup=duplicates))
40384058

4039-
for c in to_remove:
4059+
# use set to handle duplicate column names gracefully in case of drop
4060+
for c in set(to_remove):
40404061
del frame[c]
40414062

40424063
# clear up memory usage

pandas/tests/frame/conftest.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -211,12 +211,13 @@ def frame_of_index_cols():
211211
"""
212212
Fixture for DataFrame of columns that can be used for indexing
213213
214-
Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
215-
are jointly unique), the rest are unique.
214+
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
215+
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
216216
"""
217217
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
218218
'B': ['one', 'two', 'three', 'one', 'two'],
219219
'C': ['a', 'b', 'c', 'd', 'e'],
220220
'D': np.random.randn(5),
221-
'E': np.random.randn(5)})
221+
'E': np.random.randn(5),
222+
('tuple', 'as', 'label'): np.random.randn(5)})
222223
return df

pandas/tests/frame/test_alter_axes.py

+57-30
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,8 @@ def test_set_index_cast(self):
4949
tm.assert_frame_equal(df, df2)
5050

5151
# A has duplicate values, C does not
52-
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
52+
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
53+
('tuple', 'as', 'label')])
5354
@pytest.mark.parametrize('inplace', [True, False])
5455
@pytest.mark.parametrize('drop', [True, False])
5556
def test_set_index_drop_inplace(self, frame_of_index_cols,
@@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols,
7273
tm.assert_frame_equal(result, expected)
7374

7475
# A has duplicate values, C does not
75-
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
76+
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
77+
('tuple', 'as', 'label')])
7678
@pytest.mark.parametrize('drop', [True, False])
7779
def test_set_index_append(self, frame_of_index_cols, drop, keys):
7880
df = frame_of_index_cols
@@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys):
8890
tm.assert_frame_equal(result, expected)
8991

9092
# A has duplicate values, C does not
91-
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
93+
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
94+
('tuple', 'as', 'label')])
9295
@pytest.mark.parametrize('drop', [True, False])
9396
def test_set_index_append_to_multiindex(self, frame_of_index_cols,
9497
drop, keys):
@@ -114,8 +117,10 @@ def test_set_index_after_mutation(self):
114117
tm.assert_frame_equal(result, expected)
115118

116119
# MultiIndex constructor does not work directly on Series -> lambda
120+
# Add list-of-list constructor because list is ambiguous -> lambda
117121
# also test index name if append=True (name is duplicate here for B)
118122
@pytest.mark.parametrize('box', [Series, Index, np.array,
123+
list, tuple, iter, lambda x: [list(x)],
119124
lambda x: MultiIndex.from_arrays([x])])
120125
@pytest.mark.parametrize('append, index_name', [(True, None),
121126
(True, 'B'), (True, 'test'), (False, None)])
@@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
126131
df.index.name = index_name
127132

128133
key = box(df['B'])
129-
# np.array and list "forget" the name of B
130-
name = [None if box in [np.array, list] else 'B']
134+
if box == list:
135+
# list of strings gets interpreted as list of keys
136+
msg = "['one', 'two', 'three', 'one', 'two']"
137+
with tm.assert_raises_regex(KeyError, msg):
138+
df.set_index(key, drop=drop, append=append)
139+
else:
140+
# np.array/tuple/iter/list-of-list "forget" the name of B
141+
name_mi = getattr(key, 'names', None)
142+
name = [getattr(key, 'name', None)] if name_mi is None else name_mi
131143

132-
result = df.set_index(key, drop=drop, append=append)
144+
result = df.set_index(key, drop=drop, append=append)
133145

134-
# only valid column keys are dropped
135-
# since B is always passed as array above, nothing is dropped
136-
expected = df.set_index(['B'], drop=False, append=append)
137-
expected.index.names = [index_name] + name if append else name
146+
# only valid column keys are dropped
147+
# since B is always passed as array above, nothing is dropped
148+
expected = df.set_index(['B'], drop=False, append=append)
149+
expected.index.names = [index_name] + name if append else name
138150

139-
tm.assert_frame_equal(result, expected)
151+
tm.assert_frame_equal(result, expected)
140152

141153
# MultiIndex constructor does not work directly on Series -> lambda
142154
# also test index name if append=True (name is duplicate here for A & B)
143-
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
155+
@pytest.mark.parametrize('box', [Series, Index, np.array,
156+
list, tuple, iter,
144157
lambda x: MultiIndex.from_arrays([x])])
145158
@pytest.mark.parametrize('append, index_name',
146159
[(True, None), (True, 'A'), (True, 'B'),
@@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
152165
df.index.name = index_name
153166

154167
keys = ['A', box(df['B'])]
155-
# np.array and list "forget" the name of B
156-
names = ['A', None if box in [np.array, list] else 'B']
168+
# np.array/list/tuple/iter "forget" the name of B
169+
names = ['A', None if box in [np.array, list, tuple, iter] else 'B']
157170

158171
result = df.set_index(keys, drop=drop, append=append)
159172

@@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
168181
# MultiIndex constructor does not work directly on Series -> lambda
169182
# We also emulate a "constructor" for the label -> lambda
170183
# also test index name if append=True (name is duplicate here for A)
171-
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
184+
@pytest.mark.parametrize('box2', [Series, Index, np.array,
185+
list, tuple, iter,
172186
lambda x: MultiIndex.from_arrays([x]),
173187
lambda x: x.name])
174-
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
188+
@pytest.mark.parametrize('box1', [Series, Index, np.array,
189+
list, tuple, iter,
175190
lambda x: MultiIndex.from_arrays([x]),
176191
lambda x: x.name])
177192
@pytest.mark.parametrize('append, index_name', [(True, None),
@@ -183,21 +198,22 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
183198
df.index.name = index_name
184199

185200
keys = [box1(df['A']), box2(df['A'])]
201+
result = df.set_index(keys, drop=drop, append=append)
186202

187-
# == gives ambiguous Boolean for Series
188-
if drop and keys[0] is 'A' and keys[1] is 'A':
189-
with tm.assert_raises_regex(KeyError, '.*'):
190-
df.set_index(keys, drop=drop, append=append)
191-
else:
192-
result = df.set_index(keys, drop=drop, append=append)
203+
# if either box was iter, the content has been consumed; re-read it
204+
keys = [box1(df['A']), box2(df['A'])]
193205

194-
# to test against already-tested behavior, we add sequentially,
195-
# hence second append always True; must wrap in list, otherwise
196-
# list-box will be illegal
197-
expected = df.set_index([keys[0]], drop=drop, append=append)
198-
expected = expected.set_index([keys[1]], drop=drop, append=True)
206+
# need to adapt the first drop for the case that both keys are 'A' --
207+
# cannot drop the same column twice;
208+
# use "is" because == would give ambiguous Boolean error for containers
209+
first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop
199210

200-
tm.assert_frame_equal(result, expected)
211+
# to test against already-tested behaviour, we add sequentially,
212+
# hence second append always True; must wrap keys in list, otherwise
213+
# box = list would be illegal
214+
expected = df.set_index([keys[0]], drop=first_drop, append=append)
215+
expected = expected.set_index([keys[1]], drop=drop, append=True)
216+
tm.assert_frame_equal(result, expected)
201217

202218
@pytest.mark.parametrize('append', [True, False])
203219
@pytest.mark.parametrize('drop', [True, False])
@@ -229,13 +245,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
229245
def test_set_index_raise(self, frame_of_index_cols, drop, append):
230246
df = frame_of_index_cols
231247

232-
with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
248+
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
249+
# column names are A-E, as well as one tuple
233250
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
234251

235252
# non-existent key in list with arrays
236-
with tm.assert_raises_regex(KeyError, '.*'):
253+
with tm.assert_raises_regex(KeyError, 'X'):
237254
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
238255

256+
msg = 'The parameter "keys" may only contain a combination of.*'
257+
# forbidden type, e.g. set
258+
with tm.assert_raises_regex(TypeError, msg):
259+
df.set_index(set(df['A']), drop=drop, append=append)
260+
261+
# forbidden type in list, e.g. set
262+
with tm.assert_raises_regex(TypeError, msg):
263+
df.set_index(['A', df['A'], set(df['A'])],
264+
drop=drop, append=append)
265+
239266
def test_construction_with_categorical_index(self):
240267
ci = tm.makeCategoricalIndex(10)
241268
ci.name = 'B'

0 commit comments

Comments
 (0)