-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: fix DataFrame.__getitem__ and .loc with non-list listlikes #21313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2e63f5b
c2dbc40
1d98cc7
837104b
5bd6eb8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2670,68 +2670,80 @@ def _ixs(self, i, axis=0): | |
def __getitem__(self, key): | ||
key = com._apply_if_callable(key, self) | ||
|
||
# shortcut if we are an actual column | ||
is_mi_columns = isinstance(self.columns, MultiIndex) | ||
# shortcut if the key is in columns | ||
try: | ||
if key in self.columns and not is_mi_columns: | ||
return self._getitem_column(key) | ||
except: | ||
if self.columns.is_unique and key in self.columns: | ||
if self.columns.nlevels > 1: | ||
return self._getitem_multilevel(key) | ||
return self._get_item_cache(key) | ||
except (TypeError, ValueError): | ||
# The TypeError correctly catches non hashable "key" (e.g. list) | ||
# The ValueError can be removed once GH #21729 is fixed | ||
pass | ||
|
||
# see if we can slice the rows | ||
# Do we have a slicer (on rows)? | ||
indexer = convert_to_index_sliceable(self, key) | ||
if indexer is not None: | ||
return self._getitem_slice(indexer) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why the change here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, removed, it was a pretty useless one-liner |
||
return self._slice(indexer, axis=0) | ||
|
||
if isinstance(key, (Series, np.ndarray, Index, list)): | ||
# either boolean or fancy integer index | ||
return self._getitem_array(key) | ||
elif isinstance(key, DataFrame): | ||
# Do we have a (boolean) DataFrame? | ||
if isinstance(key, DataFrame): | ||
return self._getitem_frame(key) | ||
elif is_mi_columns: | ||
return self._getitem_multilevel(key) | ||
|
||
# Do we have a (boolean) 1d indexer? | ||
if com.is_bool_indexer(key): | ||
return self._getitem_bool_array(key) | ||
|
||
# We are left with two options: a single key, and a collection of keys, | ||
# We interpret tuples as collections only for non-MultiIndex | ||
is_single_key = isinstance(key, tuple) or not is_list_like(key) | ||
|
||
if is_single_key: | ||
if self.columns.nlevels > 1: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn’t this case handled by _getitem_multilevel (above)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. only if |
||
return self._getitem_multilevel(key) | ||
indexer = self.columns.get_loc(key) | ||
if is_integer(indexer): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is an argument for _take to accept a scalar integer There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not too sure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. moreover, the problem should disappear when we fix #9519 , that is, when the return type becomes predictable from the index (non-)uniqueness |
||
indexer = [indexer] | ||
else: | ||
return self._getitem_column(key) | ||
if is_iterator(key): | ||
key = list(key) | ||
indexer = self.loc._convert_to_indexer(key, axis=1, | ||
raise_missing=True) | ||
|
||
def _getitem_column(self, key): | ||
""" return the actual column """ | ||
# take() does not accept boolean indexers | ||
if getattr(indexer, "dtype", None) == bool: | ||
indexer = np.where(indexer)[0] | ||
|
||
# get column | ||
if self.columns.is_unique: | ||
return self._get_item_cache(key) | ||
data = self._take(indexer, axis=1) | ||
|
||
# duplicate columns & possible reduce dimensionality | ||
result = self._constructor(self._data.get(key)) | ||
if result.columns.is_unique: | ||
result = result[key] | ||
if is_single_key: | ||
# What does looking for a single key in a non-unique index return? | ||
# The behavior is inconsistent. It returns a Series, except when | ||
# - the key itself is repeated (test on data.shape, #9519), or | ||
# - we have a MultiIndex on columns (test on self.columns, #21309) | ||
if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): | ||
data = data[key] | ||
|
||
return result | ||
|
||
def _getitem_slice(self, key): | ||
return self._slice(key, axis=0) | ||
return data | ||
|
||
def _getitem_array(self, key): | ||
def _getitem_bool_array(self, key): | ||
# also raises Exception if object array with NA values | ||
if com.is_bool_indexer(key): | ||
# warning here just in case -- previously __setitem__ was | ||
# reindexing but __getitem__ was not; it seems more reasonable to | ||
# go with the __setitem__ behavior since that is more consistent | ||
# with all other indexing behavior | ||
if isinstance(key, Series) and not key.index.equals(self.index): | ||
warnings.warn("Boolean Series key will be reindexed to match " | ||
"DataFrame index.", UserWarning, stacklevel=3) | ||
elif len(key) != len(self.index): | ||
raise ValueError('Item wrong length %d instead of %d.' % | ||
(len(key), len(self.index))) | ||
# check_bool_indexer will throw exception if Series key cannot | ||
# be reindexed to match DataFrame rows | ||
key = check_bool_indexer(self.index, key) | ||
indexer = key.nonzero()[0] | ||
return self._take(indexer, axis=0) | ||
else: | ||
indexer = self.loc._convert_to_indexer(key, axis=1, | ||
raise_missing=True) | ||
return self._take(indexer, axis=1) | ||
# warning here just in case -- previously __setitem__ was | ||
# reindexing but __getitem__ was not; it seems more reasonable to | ||
# go with the __setitem__ behavior since that is more consistent | ||
# with all other indexing behavior | ||
if isinstance(key, Series) and not key.index.equals(self.index): | ||
warnings.warn("Boolean Series key will be reindexed to match " | ||
"DataFrame index.", UserWarning, stacklevel=3) | ||
elif len(key) != len(self.index): | ||
raise ValueError('Item wrong length %d instead of %d.' % | ||
(len(key), len(self.index))) | ||
|
||
# check_bool_indexer will throw exception if Series key cannot | ||
# be reindexed to match DataFrame rows | ||
key = check_bool_indexer(self.index, key) | ||
indexer = key.nonzero()[0] | ||
return self._take(indexer, axis=0) | ||
|
||
def _getitem_multilevel(self, key): | ||
loc = self.columns.get_loc(key) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -501,9 +501,11 @@ def test_constructor_dict_of_tuples(self): | |
tm.assert_frame_equal(result, expected, check_dtype=False) | ||
|
||
def test_constructor_dict_multiindex(self): | ||
check = lambda result, expected: tm.assert_frame_equal( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you parameterize this test? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not here, but I can avoid fixing the lambda |
||
result, expected, check_dtype=True, check_index_type=True, | ||
check_column_type=True, check_names=True) | ||
def check(result, expected): | ||
return tm.assert_frame_equal(result, expected, check_dtype=True, | ||
check_index_type=True, | ||
check_column_type=True, | ||
check_names=True) | ||
d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, | ||
('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, | ||
('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} | ||
|
@@ -1655,19 +1657,21 @@ def check(df): | |
for i in range(len(df.columns)): | ||
df.iloc[:, i] | ||
|
||
# allow single nans to succeed | ||
indexer = np.arange(len(df.columns))[isna(df.columns)] | ||
|
||
if len(indexer) == 1: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment on each of these cases There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (done) |
||
tm.assert_series_equal(df.iloc[:, indexer[0]], | ||
df.loc[:, np.nan]) | ||
|
||
# multiple nans should fail | ||
else: | ||
|
||
# No NaN found -> error | ||
if len(indexer) == 0: | ||
def f(): | ||
df.loc[:, np.nan] | ||
pytest.raises(TypeError, f) | ||
# single nan should result in Series | ||
elif len(indexer) == 1: | ||
tm.assert_series_equal(df.iloc[:, indexer[0]], | ||
df.loc[:, np.nan]) | ||
# multiple nans should result in DataFrame | ||
else: | ||
tm.assert_frame_equal(df.iloc[:, indexer], | ||
df.loc[:, np.nan]) | ||
|
||
df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) | ||
check(df) | ||
|
@@ -1683,6 +1687,11 @@ def f(): | |
columns=[np.nan, 1.1, 2.2, np.nan]) | ||
check(df) | ||
|
||
# GH 21428 (non-unique columns) | ||
df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], | ||
columns=[np.nan, 1, 2, 2]) | ||
check(df) | ||
|
||
def test_constructor_lists_to_object_dtype(self): | ||
# from #1074 | ||
d = DataFrame({'a': [np.nan, False]}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why are you changing to directly use
_get_item_cache here rather than _getitem_column? (is it removed)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, removed. It did a uniqueness test which is no more necessary, and was misleading anyway, as it could not really manage all cases in which a single column is returned.