Skip to content

Commit 43327c2

Browse files
committed
BUG: fix DataFrame.__getitem__ and .loc with non-list listlikes
close pandas-dev#21294 close pandas-dev#21428
1 parent defdb34 commit 43327c2

File tree

3 files changed

+85
-74
lines changed

3 files changed

+85
-74
lines changed

doc/source/whatsnew/v0.23.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,8 @@ Bug Fixes
110110
- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
111111
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
112112
- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
113+
- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc` which did not accept column keys passed as non-list iterables (:issue:`21294`)
114+
- Bug in :meth:`DataFrame.__getitem__` when looking up ``np.nan`` in non-unique columns (:issue:`21428`)
113115
- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
114116
- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
115117

pandas/core/frame.py

+55-47
Original file line numberDiff line numberDiff line change
@@ -2664,67 +2664,75 @@ def _ixs(self, i, axis=0):
26642664
def __getitem__(self, key):
26652665
key = com._apply_if_callable(key, self)
26662666

2667-
# shortcut if we are an actual column
2668-
is_mi_columns = isinstance(self.columns, MultiIndex)
2667+
# shortcut if the key is in columns
26692668
try:
2670-
if key in self.columns and not is_mi_columns:
2671-
return self._getitem_column(key)
2672-
except:
2669+
if self.columns.is_unique and key in self.columns:
2670+
if self.columns.nlevels > 1:
2671+
return self._getitem_multilevel(key)
2672+
return self._get_item_cache(key)
2673+
except (ValueError, TypeError):
26732674
pass
26742675

2675-
# see if we can slice the rows
2676+
# Do we have a slicer (on rows)?
26762677
indexer = convert_to_index_sliceable(self, key)
26772678
if indexer is not None:
2678-
return self._getitem_slice(indexer)
2679+
return self._slice(indexer, axis=0)
26792680

2680-
if isinstance(key, (Series, np.ndarray, Index, list)):
2681-
# either boolean or fancy integer index
2682-
return self._getitem_array(key)
2683-
elif isinstance(key, DataFrame):
2681+
# Do we have a (boolean) DataFrame?
2682+
if isinstance(key, DataFrame):
26842683
return self._getitem_frame(key)
2685-
elif is_mi_columns:
2686-
return self._getitem_multilevel(key)
2687-
else:
2688-
return self._getitem_column(key)
26892684

2690-
def _getitem_column(self, key):
2691-
""" return the actual column """
2685+
# Do we have a (boolean) 1d indexer?
2686+
if com.is_bool_indexer(key):
2687+
return self._getitem_bool_array(key)
2688+
2689+
# We are left with two options: a single key or a collection of keys.
2690+
# We interpret tuples as collections only for non-MultiIndex
2691+
is_single_key = isinstance(key, tuple) or not is_list_like(key)
2692+
2693+
if is_single_key:
2694+
if self.columns.nlevels > 1:
2695+
return self._getitem_multilevel(key)
2696+
indexer = self.columns.get_loc(key)
2697+
if is_integer(indexer):
2698+
indexer = [indexer]
2699+
else:
2700+
indexer = self.loc._convert_to_indexer(key, axis=1)
26922701

2693-
# get column
2694-
if self.columns.is_unique:
2695-
return self._get_item_cache(key)
2702+
# take() does not accept boolean indexers
2703+
if getattr(indexer, "dtype", None) == bool:
2704+
indexer = np.where(indexer)[0]
26962705

2697-
# duplicate columns & possible reduce dimensionality
2698-
result = self._constructor(self._data.get(key))
2699-
if result.columns.is_unique:
2700-
result = result[key]
2706+
data = self._take(indexer, axis=1)
27012707

2702-
return result
2708+
if is_single_key:
2709+
# What does looking for a single key in a non-unique index return?
2710+
# The behavior is inconsistent. It returns a Series, except when
2711+
# - the key itself is repeated (test on data.shape, #9519), or
2712+
# - we have a MultiIndex on columns (test on self.columns, #21309)
2713+
if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
2714+
data = data[key]
27032715

2704-
def _getitem_slice(self, key):
2705-
return self._slice(key, axis=0)
2716+
return data
27062717

2707-
def _getitem_array(self, key):
2718+
def _getitem_bool_array(self, key):
27082719
# also raises Exception if object array with NA values
2709-
if com.is_bool_indexer(key):
2710-
# warning here just in case -- previously __setitem__ was
2711-
# reindexing but __getitem__ was not; it seems more reasonable to
2712-
# go with the __setitem__ behavior since that is more consistent
2713-
# with all other indexing behavior
2714-
if isinstance(key, Series) and not key.index.equals(self.index):
2715-
warnings.warn("Boolean Series key will be reindexed to match "
2716-
"DataFrame index.", UserWarning, stacklevel=3)
2717-
elif len(key) != len(self.index):
2718-
raise ValueError('Item wrong length %d instead of %d.' %
2719-
(len(key), len(self.index)))
2720-
# check_bool_indexer will throw exception if Series key cannot
2721-
# be reindexed to match DataFrame rows
2722-
key = check_bool_indexer(self.index, key)
2723-
indexer = key.nonzero()[0]
2724-
return self._take(indexer, axis=0)
2725-
else:
2726-
indexer = self.loc._convert_to_indexer(key, axis=1)
2727-
return self._take(indexer, axis=1)
2720+
# warning here just in case -- previously __setitem__ was
2721+
# reindexing but __getitem__ was not; it seems more reasonable to
2722+
# go with the __setitem__ behavior since that is more consistent
2723+
# with all other indexing behavior
2724+
if isinstance(key, Series) and not key.index.equals(self.index):
2725+
warnings.warn("Boolean Series key will be reindexed to match "
2726+
"DataFrame index.", UserWarning, stacklevel=3)
2727+
elif len(key) != len(self.index):
2728+
raise ValueError('Item wrong length %d instead of %d.' %
2729+
(len(key), len(self.index)))
2730+
2731+
# check_bool_indexer will throw exception if Series key cannot
2732+
# be reindexed to match DataFrame rows
2733+
key = check_bool_indexer(self.index, key)
2734+
indexer = key.nonzero()[0]
2735+
return self._take(indexer, axis=0)
27282736

27292737
def _getitem_multilevel(self, key):
27302738
loc = self.columns.get_loc(key)

pandas/tests/frame/test_indexing.py

+28-27
Original file line numberDiff line numberDiff line change
@@ -92,45 +92,46 @@ def test_get(self):
9292
result = df.get(None)
9393
assert result is None
9494

95-
def test_getitem_iterator(self):
95+
def test_loc_iterable(self):
9696
idx = iter(['A', 'B', 'C'])
9797
result = self.frame.loc[:, idx]
9898
expected = self.frame.loc[:, ['A', 'B', 'C']]
9999
assert_frame_equal(result, expected)
100100

101-
idx = iter(['A', 'B', 'C'])
102-
result = self.frame.loc[:, idx]
103-
expected = self.frame.loc[:, ['A', 'B', 'C']]
104-
assert_frame_equal(result, expected)
101+
@pytest.mark.parametrize(
102+
"idx_type",
103+
[list, iter, Index, set,
104+
lambda l: dict(zip(l, range(len(l)))),
105+
lambda l: dict(zip(l, range(len(l)))).keys()],
106+
ids=["list", "iter", "Index", "set", "dict", "dict_keys"])
107+
@pytest.mark.parametrize("levels", [1, 2])
108+
def test_getitem_listlike(self, idx_type, levels):
109+
# GH 21294
110+
111+
if levels == 1:
112+
frame, missing = self.frame, 'food'
113+
else:
114+
# MultiIndex columns
115+
frame = DataFrame(randn(8, 3),
116+
columns=Index([('foo', 'bar'), ('baz', 'qux'),
117+
('peek', 'aboo')],
118+
name=('sth', 'sth2')))
119+
missing = ('good', 'food')
105120

106-
def test_getitem_list(self):
107-
self.frame.columns.name = 'foo'
121+
keys = [frame.columns[1], frame.columns[0]]
122+
idx = idx_type(keys)
123+
idx_check = list(idx_type(keys))
108124

109-
result = self.frame[['B', 'A']]
110-
result2 = self.frame[Index(['B', 'A'])]
125+
result = frame[idx]
111126

112-
expected = self.frame.loc[:, ['B', 'A']]
113-
expected.columns.name = 'foo'
127+
expected = frame.loc[:, idx_check]
128+
expected.columns.names = frame.columns.names
114129

115130
assert_frame_equal(result, expected)
116-
assert_frame_equal(result2, expected)
117131

118-
assert result.columns.name == 'foo'
119-
120-
with tm.assert_raises_regex(KeyError, 'not in index'):
121-
self.frame[['B', 'A', 'food']]
132+
idx = idx_type(keys + [missing])
122133
with tm.assert_raises_regex(KeyError, 'not in index'):
123-
self.frame[Index(['B', 'A', 'foo'])]
124-
125-
# tuples
126-
df = DataFrame(randn(8, 3),
127-
columns=Index([('foo', 'bar'), ('baz', 'qux'),
128-
('peek', 'aboo')], name=('sth', 'sth2')))
129-
130-
result = df[[('foo', 'bar'), ('baz', 'qux')]]
131-
expected = df.iloc[:, :2]
132-
assert_frame_equal(result, expected)
133-
assert result.columns.names == ('sth', 'sth2')
134+
frame[idx]
134135

135136
def test_getitem_callable(self):
136137
# GH 12533

0 commit comments

Comments
 (0)