Skip to content

Commit 287a5d7

Browse files
h-vetinari, Pingviinituutti
authored and committed
DOC: update DF.set_index (pandas-dev#24762)
1 parent b54f091 commit 287a5d7

File tree

3 files changed

+73
-48
lines changed

3 files changed

+73
-48
lines changed

doc/source/whatsnew/v0.24.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -1147,8 +1147,8 @@ Other API Changes
11471147
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
11481148
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
11491149
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
1150-
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
1151-
has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
1150+
- :meth:`DataFrame.set_index` now gives a better (and less frequent) ``KeyError``, raises a ``ValueError`` for incorrect types,
1151+
and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
11521152
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
11531153
- :class:`DateOffset` attribute ``_cacheable`` and method ``_should_cache`` have been removed (:issue:`23118`)
11541154
- :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`).

pandas/core/frame.py

+39-25
Original file line numberDiff line numberDiff line change
@@ -4042,12 +4042,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
40424042
Set the DataFrame index using existing columns.
40434043
40444044
Set the DataFrame index (row labels) using one or more existing
4045-
columns. The index can replace the existing index or expand on it.
4045+
columns or arrays (of the correct length). The index can replace the
4046+
existing index or expand on it.
40464047
40474048
Parameters
40484049
----------
4049-
keys : label or list of label
4050-
Name or names of the columns that will be used as the index.
4050+
keys : label or array-like or list of labels/arrays
4051+
This parameter can be either a single column key, a single array of
4052+
the same length as the calling DataFrame, or a list containing an
4053+
arbitrary combination of column keys and arrays. Here, "array"
4054+
encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
40514055
drop : bool, default True
40524056
Delete columns to be used as the new index.
40534057
append : bool, default False
@@ -4092,7 +4096,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
40924096
7 2013 84
40934097
10 2014 31
40944098
4095-
Create a multi-index using columns 'year' and 'month':
4099+
Create a MultiIndex using columns 'year' and 'month':
40964100
40974101
>>> df.set_index(['year', 'month'])
40984102
sale
@@ -4102,35 +4106,51 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
41024106
2013 7 84
41034107
2014 10 31
41044108
4105-
Create a multi-index using a set of values and a column:
4109+
Create a MultiIndex using an Index and a column:
41064110
4107-
>>> df.set_index([[1, 2, 3, 4], 'year'])
4111+
>>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
41084112
month sale
41094113
year
41104114
1 2012 1 55
41114115
2 2014 4 40
41124116
3 2013 7 84
41134117
4 2014 10 31
4118+
4119+
Create a MultiIndex using two Series:
4120+
4121+
>>> s = pd.Series([1, 2, 3, 4])
4122+
>>> df.set_index([s, s**2])
4123+
month year sale
4124+
1 1 1 2012 55
4125+
2 4 4 2014 40
4126+
3 9 7 2013 84
4127+
4 16 10 2014 31
41144128
"""
41154129
inplace = validate_bool_kwarg(inplace, 'inplace')
4116-
if not isinstance(keys, list):
4130+
4131+
err_msg = ('The parameter "keys" may be a column key, one-dimensional '
4132+
'array, or a list containing only valid column keys and '
4133+
'one-dimensional arrays.')
4134+
4135+
if (is_scalar(keys) or isinstance(keys, tuple)
4136+
or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))):
4137+
# make sure we have a container of keys/arrays we can iterate over
4138+
# tuples can appear as valid column keys!
41174139
keys = [keys]
4140+
elif not isinstance(keys, list):
4141+
raise ValueError(err_msg)
41184142

41194143
missing = []
41204144
for col in keys:
4121-
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
4122-
# tuples can be both column keys or list-likes
4123-
# if they are valid column keys, everything is fine
4124-
continue
4125-
elif is_scalar(col) and col not in self:
4126-
# tuples that are not column keys are considered list-like,
4127-
# not considered missing
4128-
missing.append(col)
4129-
elif (not is_list_like(col, allow_sets=False)
4145+
if (is_scalar(col) or isinstance(col, tuple)):
4146+
# if col is a valid column key, everything is fine
4147+
# tuples are always considered keys, never as list-likes
4148+
if col not in self:
4149+
missing.append(col)
4150+
elif (not isinstance(col, (ABCIndexClass, ABCSeries,
4151+
np.ndarray, list))
41304152
or getattr(col, 'ndim', 1) > 1):
4131-
raise TypeError('The parameter "keys" may only contain a '
4132-
'combination of valid column keys and '
4133-
'one-dimensional list-likes')
4153+
raise ValueError(err_msg)
41344154

41354155
if missing:
41364156
raise KeyError('{}'.format(missing))
@@ -4163,12 +4183,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
41634183
elif isinstance(col, (list, np.ndarray)):
41644184
arrays.append(col)
41654185
names.append(None)
4166-
elif (is_list_like(col)
4167-
and not (isinstance(col, tuple) and col in self)):
4168-
# all other list-likes (but avoid valid column keys)
4169-
col = list(col) # ensure iterator do not get read twice etc.
4170-
arrays.append(col)
4171-
names.append(None)
41724186
# from here, col can only be a column label
41734187
else:
41744188
arrays.append(frame[col]._values)

pandas/tests/frame/test_alter_axes.py

+32-21
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_set_index_after_mutation(self):
118118
# Add list-of-list constructor because list is ambiguous -> lambda
119119
# also test index name if append=True (name is duplicate here for B)
120120
@pytest.mark.parametrize('box', [Series, Index, np.array,
121-
list, tuple, iter, lambda x: [list(x)],
121+
list, lambda x: [list(x)],
122122
lambda x: MultiIndex.from_arrays([x])])
123123
@pytest.mark.parametrize('append, index_name', [(True, None),
124124
(True, 'B'), (True, 'test'), (False, None)])
@@ -135,7 +135,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
135135
with pytest.raises(KeyError, match=msg):
136136
df.set_index(key, drop=drop, append=append)
137137
else:
138-
# np.array/tuple/iter/list-of-list "forget" the name of B
138+
# np.array/list-of-list "forget" the name of B
139139
name_mi = getattr(key, 'names', None)
140140
name = [getattr(key, 'name', None)] if name_mi is None else name_mi
141141

@@ -150,8 +150,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
150150

151151
# MultiIndex constructor does not work directly on Series -> lambda
152152
# also test index name if append=True (name is duplicate here for A & B)
153-
@pytest.mark.parametrize('box', [Series, Index, np.array,
154-
list, tuple, iter,
153+
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
155154
lambda x: MultiIndex.from_arrays([x])])
156155
@pytest.mark.parametrize('append, index_name',
157156
[(True, None), (True, 'A'), (True, 'B'),
@@ -163,7 +162,7 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
163162
df.index.name = index_name
164163

165164
keys = ['A', box(df['B'])]
166-
# np.array/list/tuple/iter "forget" the name of B
165+
# np.array/list "forget" the name of B
167166
names = ['A', None if box in [np.array, list, tuple, iter] else 'B']
168167

169168
result = df.set_index(keys, drop=drop, append=append)
@@ -179,12 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
179178
# MultiIndex constructor does not work directly on Series -> lambda
180179
# We also emulate a "constructor" for the label -> lambda
181180
# also test index name if append=True (name is duplicate here for A)
182-
@pytest.mark.parametrize('box2', [Series, Index, np.array,
183-
list, tuple, iter,
181+
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
184182
lambda x: MultiIndex.from_arrays([x]),
185183
lambda x: x.name])
186-
@pytest.mark.parametrize('box1', [Series, Index, np.array,
187-
list, tuple, iter,
184+
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
188185
lambda x: MultiIndex.from_arrays([x]),
189186
lambda x: x.name])
190187
@pytest.mark.parametrize('append, index_name', [(True, None),
@@ -198,17 +195,14 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
198195
keys = [box1(df['A']), box2(df['A'])]
199196
result = df.set_index(keys, drop=drop, append=append)
200197

201-
# if either box was iter, the content has been consumed; re-read it
202-
keys = [box1(df['A']), box2(df['A'])]
203-
204198
# need to adapt first drop for case that both keys are 'A' --
205199
# cannot drop the same column twice;
206200
# use "is" because == would give ambiguous Boolean error for containers
207201
first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop
208202

209203
# to test against already-tested behaviour, we add sequentially,
210204
# hence second append always True; must wrap keys in list, otherwise
211-
# box = list would be illegal
205+
# box = list would be interpreted as keys
212206
expected = df.set_index([keys[0]], drop=first_drop, append=append)
213207
expected = expected.set_index([keys[1]], drop=drop, append=True)
214208
tm.assert_frame_equal(result, expected)
@@ -238,7 +232,7 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
238232

239233
@pytest.mark.parametrize('append', [True, False])
240234
@pytest.mark.parametrize('drop', [True, False])
241-
def test_set_index_raise(self, frame_of_index_cols, drop, append):
235+
def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
242236
df = frame_of_index_cols
243237

244238
with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
@@ -249,14 +243,31 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append):
249243
with pytest.raises(KeyError, match='X'):
250244
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
251245

252-
msg = 'The parameter "keys" may only contain a combination of.*'
253-
# forbidden type, e.g. set
254-
with pytest.raises(TypeError, match=msg):
255-
df.set_index(set(df['A']), drop=drop, append=append)
246+
msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"
247+
# tuples always raise KeyError
248+
with pytest.raises(KeyError, match=msg):
249+
df.set_index(tuple(df['A']), drop=drop, append=append)
250+
251+
# also within a list
252+
with pytest.raises(KeyError, match=msg):
253+
df.set_index(['A', df['A'], tuple(df['A'])],
254+
drop=drop, append=append)
255+
256+
@pytest.mark.parametrize('append', [True, False])
257+
@pytest.mark.parametrize('drop', [True, False])
258+
@pytest.mark.parametrize('box', [set, iter])
259+
def test_set_index_raise_on_type(self, frame_of_index_cols, box,
260+
drop, append):
261+
df = frame_of_index_cols
262+
263+
msg = 'The parameter "keys" may be a column key, .*'
264+
# forbidden type, e.g. set/iter (tuples are treated as keys -> KeyError)
265+
with pytest.raises(ValueError, match=msg):
266+
df.set_index(box(df['A']), drop=drop, append=append)
256267

257-
# forbidden type in list, e.g. set
258-
with pytest.raises(TypeError, match=msg):
259-
df.set_index(['A', df['A'], set(df['A'])],
268+
# forbidden type in list, e.g. set/iter (tuples are treated as keys)
269+
with pytest.raises(ValueError, match=msg):
270+
df.set_index(['A', df['A'], box(df['A'])],
260271
drop=drop, append=append)
261272

262273
def test_construction_with_categorical_index(self):

0 commit comments

Comments
 (0)