Skip to content

BUG: Various inconsistencies in DataFrame getitem/setitem behavior (fixes #2765) #2776

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 102 additions & 94 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
_default_index, _is_sequence)
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels
from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels,
_is_index_slice, _check_bool_indexer)
from pandas.core.internals import BlockManager, make_block, form_blocks
from pandas.core.series import Series, _radd_compat, _dtype_from_scalar
from pandas.compat.scipy import scoreatpercentile as _quantile
Expand Down Expand Up @@ -1970,73 +1971,53 @@ def iget_value(self, i, j):
return self.get_value(row, col)

def __getitem__(self, key):
    """Return the rows, columns or masked frame selected by ``key``.

    Dispatch is by the type of ``key``:

    * ``slice`` -- row slice, via ``_getitem_slice``.
    * ``np.ndarray`` / ``list`` -- boolean or fancy index, via
      ``_getitem_array``.
    * ``DataFrame`` -- handled by ``_getitem_frame``.
    * anything else -- ``_getitem_multilevel`` when the columns are a
      ``MultiIndex``, otherwise a single-column lookup through
      ``_get_item_cache``.
    """
    if isinstance(key, slice):
        # slice rows
        return self._getitem_slice(key)
    if isinstance(key, (np.ndarray, list)):
        # either boolean or fancy integer index
        return self._getitem_array(key)
    if isinstance(key, DataFrame):
        # must be tested before the MultiIndex branch so that a DataFrame
        # key is never treated as a column label
        return self._getitem_frame(key)
    if isinstance(self.columns, MultiIndex):
        return self._getitem_multilevel(key)
    # get column
    return self._get_item_cache(key)

def _getitem_slice(self, key):
    """Return the rows selected by the slice ``key``.

    The slice is used as-is when the index's ``inferred_type`` is
    ``'integer'``, or when the index is not ``'floating'`` and
    ``_is_index_slice(key)`` is true.  In every other case it is first
    converted with ``self.ix._convert_to_indexer(key, axis=0)``.
    """
    inferred = self.index.inferred_type
    # A floating index always goes through the converter; the check order
    # keeps _is_index_slice from being consulted for it.
    use_as_is = inferred != 'floating' and (
        inferred == 'integer' or _is_index_slice(key))
    if use_as_is:
        row_indexer = key
    else:
        row_indexer = self.ix._convert_to_indexer(key, axis=0)
    return self._slice(row_indexer, axis=0)

def _getitem_array(self, key):
    """Select with an array-like ``key`` (ndarray or list).

    A boolean key selects rows: it is validated and aligned by
    ``_check_bool_indexer`` and the positions of its true entries are
    passed to ``self.take(..., axis=0)``.  Any other key selects columns:
    it is converted with ``self.ix._convert_to_indexer(key, axis=1)`` and
    passed to ``self.take(..., axis=1)``.

    Raises
    ------
    ValueError
        If a boolean key that is not a misaligned Series has a length
        different from ``len(self.index)``.
    """
    # also raises Exception if object array with NA values
    if com._is_bool_indexer(key):
        # warning here just in case -- previously __setitem__ was
        # reindexing but __getitem__ was not; it seems more reasonable to
        # go with the __setitem__ behavior since that is more consistent
        # with all other indexing behavior
        if isinstance(key, Series) and not key.index.equals(self.index):
            import warnings
            warnings.warn("Boolean Series key will be reindexed to match "
                          "DataFrame index.", UserWarning)
        elif len(key) != len(self.index):
            raise ValueError('Item wrong length %d instead of %d!' %
                             (len(key), len(self.index)))
        # _check_bool_indexer will throw exception if Series key cannot
        # be reindexed to match DataFrame rows
        key = _check_bool_indexer(self.index, key)
        indexer = key.nonzero()[0]
        return self.take(indexer, axis=0)

    indexer = self.ix._convert_to_indexer(key, axis=1)
    return self.take(indexer, axis=1)

def _getitem_multilevel(self, key):
loc = self.columns.get_loc(key)
if isinstance(loc, (slice, np.ndarray)):
Expand All @@ -2061,6 +2042,20 @@ def _getitem_multilevel(self, key):
else:
return self._get_item_cache(key)

def _getitem_frame(self, key):
    """Return ``self.where(key)`` for a boolean DataFrame ``key``.

    Raises
    ------
    ValueError
        If ``key.values.dtype`` is not ``np.bool_``.
    """
    if key.values.dtype == np.bool_:
        return self.where(key)
    raise ValueError('Must pass DataFrame with boolean values only')

def _slice(self, slobj, axis=0):
    """Build a new frame from ``self._data`` sliced by ``slobj``.

    ``axis=0`` is translated to axis 1 of ``self._data``; any other value
    is translated to axis 0.  The sliced data is wrapped with
    ``self._constructor``.
    """
    mgr_axis = 1 if axis == 0 else 0
    sliced = self._data.get_slice(slobj, axis=mgr_axis)
    return self._constructor(sliced)

def _box_item_values(self, key, values):
items = self.columns[self.columns.get_loc(key)]
if values.ndim == 2:
Expand Down Expand Up @@ -2094,39 +2089,55 @@ def __setattr__(self, name, value):
object.__setattr__(self, name, value)

def __setitem__(self, key, value):
    """Assign ``value`` to the part of the frame selected by ``key``.

    Dispatch is by the type of ``key``:

    * ``slice`` -- row slice, via ``_setitem_slice``.
    * ``np.ndarray`` / ``list`` -- via ``_setitem_array``.
    * ``DataFrame`` -- via ``_setitem_frame`` (e.g. ``df[df > df2] = 0``).
    * anything else -- treated as a column key, via ``_set_item``.
    """
    if isinstance(key, slice):
        # slice rows
        self._setitem_slice(key, value)
    elif isinstance(key, (np.ndarray, list)):
        self._setitem_array(key, value)
    elif isinstance(key, DataFrame):
        self._setitem_frame(key, value)
    else:
        # set column
        self._set_item(key, value)

def _setitem_slice(self, key, value):
    """Assign ``value`` to the rows selected by the slice ``key``.

    The slice is used as-is when the index's ``inferred_type`` is
    ``'integer'``, or when the index is not ``'floating'`` and
    ``_is_index_slice(key)`` is true; otherwise it is converted with
    ``self.ix._convert_to_indexer(key, axis=0)``.  The assignment itself
    is delegated to ``self.ix._setitem_with_indexer``.
    """
    inferred = self.index.inferred_type
    # A floating index always goes through the converter; the check order
    # keeps _is_index_slice from being consulted for it.
    use_as_is = inferred != 'floating' and (
        inferred == 'integer' or _is_index_slice(key))
    if use_as_is:
        row_indexer = key
    else:
        row_indexer = self.ix._convert_to_indexer(key, axis=0)
    self.ix._setitem_with_indexer(row_indexer, value)

def _boolean_set(self, key, value):
def _setitem_array(self, key, value):
    """Assign ``value`` through an array-like ``key`` (ndarray or list).

    * Boolean key: selects rows.  Its length must equal
      ``len(self.index)``; the positions of its true entries are handed
      to ``self.ix._setitem_with_indexer``.
    * Other key with a DataFrame ``value``: ``key`` and ``value.columns``
      are paired positionally and each pair is assigned as
      ``self[k] = value[col]``.
    * Other key otherwise: ``key`` is converted to a column indexer and
      assigned across all rows.

    Raises
    ------
    ValueError
        If a boolean key has the wrong length.
    AssertionError
        If a DataFrame ``value`` has a different number of columns than
        ``key`` has entries.
    """
    # also raises Exception if object array with NA values
    if com._is_bool_indexer(key):
        n_key, n_rows = len(key), len(self.index)
        if n_key != n_rows:
            raise ValueError('Item wrong length %d instead of %d!' %
                             (n_key, n_rows))
        mask = _check_bool_indexer(self.index, key)
        positions = mask.nonzero()[0]
        self.ix._setitem_with_indexer(positions, value)
    elif isinstance(value, DataFrame):
        if len(value.columns) != len(key):
            raise AssertionError('Columns must be same length as key')
        for target, source in zip(key, value.columns):
            self[target] = value[source]
    else:
        col_indexer = self.ix._convert_to_indexer(key, axis=1)
        self.ix._setitem_with_indexer((slice(None), col_indexer), value)

def _setitem_frame(self, key, value):
    """Boolean setting with a DataFrame key, e.g. ``df[df > df2] = 0``.

    The negated key is passed to ``self.where`` together with ``value``
    and ``inplace=True``.

    Raises
    ------
    ValueError
        If ``key.values.dtype`` is not ``np.bool_``, or if
        ``self._is_mixed_type`` is true.
    """
    key_is_boolean = key.values.dtype == np.bool_
    if not key_is_boolean:
        raise ValueError('Must pass DataFrame with boolean values only')

    if self._is_mixed_type:
        raise ValueError('Cannot do boolean setting on mixed-type frame')

    negated = -key
    self.where(negated, value, inplace=True)

def _set_item_multiple(self, keys, value):
    """Assign ``value`` through several ``keys`` at once.

    * DataFrame ``value``: ``keys`` and ``value.columns`` are paired
      positionally and each pair is assigned as ``self[k] = value[col]``.
    * Boolean ndarray ``keys``: assigns to rows (``self.ix[keys, :]``).
    * Anything else: assigns to columns (``self.ix[:, keys]``).

    Raises
    ------
    AssertionError
        If a DataFrame ``value`` has a different number of columns than
        there are ``keys``.
    """
    if isinstance(value, DataFrame):
        if len(value.columns) != len(keys):
            raise AssertionError('Columns must be same length as keys')
        for target, source in zip(keys, value.columns):
            self[target] = value[source]
        return

    row_mask = isinstance(keys, np.ndarray) and keys.dtype == np.bool_
    if row_mask:
        # boolean slicing should happen on rows, consistent with
        # behavior of getitem
        self.ix[keys, :] = value
    else:
        self.ix[:, keys] = value

def _set_item(self, key, value):
"""
Add series to DataFrame in specified column.
Expand Down Expand Up @@ -2920,7 +2931,7 @@ def take(self, indices, axis=0):
"""
if isinstance(indices, list):
indices = np.array(indices)
if self._data.is_mixed_dtype():
if self._is_mixed_type:
if axis == 0:
new_data = self._data.take(indices, axis=1)
return DataFrame(new_data)
Expand Down Expand Up @@ -3249,7 +3260,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):

new_axis, indexer = the_axis.sortlevel(level, ascending=ascending)

if self._data.is_mixed_dtype() and not inplace:
if self._is_mixed_type and not inplace:
if axis == 0:
return self.reindex(index=new_axis)
else:
Expand Down Expand Up @@ -5240,25 +5251,22 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr
-------
wh: DataFrame
"""
if not hasattr(cond, 'shape'):
raise ValueError('where requires an ndarray like object for its '
'condition')

if isinstance(cond, np.ndarray):
if isinstance(cond, DataFrame):
# this already checks for index/column equality
cond = cond.reindex(self.index, columns=self.columns)
else:
if not hasattr(cond, 'shape'):
raise ValueError('where requires an ndarray like object for its '
'condition')
if cond.shape != self.shape:
raise ValueError('Array conditional must be same shape as self')
cond = self._constructor(cond, index=self.index,
columns=self.columns)

if cond.shape != self.shape:
cond = cond.reindex(self.index, columns=self.columns)

if inplace:
cond = -(cond.fillna(True).astype(bool))
else:
cond = cond.fillna(False).astype(bool)
elif inplace:
cond = -cond
if inplace:
cond = -(cond.fillna(True).astype(bool))
else:
cond = cond.fillna(False).astype(bool)

if isinstance(other, DataFrame):
_, other = self.align(other, join='left', fill_value=NA)
Expand Down
49 changes: 21 additions & 28 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def _convert_for_reindex(self, key, axis=0):

if com._is_bool_indexer(key):
key = _check_bool_indexer(labels, key)
return labels[np.asarray(key)]
return labels[key]
else:
if isinstance(key, Index):
# want Index objects to pass through untouched
Expand Down Expand Up @@ -340,28 +340,19 @@ def _getitem_axis(self, key, axis=0):
raise ValueError('Cannot index with multidimensional key')

return self._getitem_iterable(key, axis=axis)
elif axis == 0:
is_int_index = _is_integer_index(labels)

idx = key
else:
if com.is_integer(key):
if isinstance(labels, MultiIndex):
if axis == 0 and isinstance(labels, MultiIndex):
try:
return self._get_label(key, axis=0)
return self._get_label(key, axis=axis)
except (KeyError, TypeError):
if _is_integer_index(self.obj.index.levels[0]):
raise

if not is_int_index:
return self._get_loc(key, axis=0)
if not _is_integer_index(labels):
return self._get_loc(key, axis=axis)

return self._get_label(idx, axis=0)
else:
labels = self.obj._get_axis(axis)
lab = key
if com.is_integer(key) and not _is_integer_index(labels):
return self._get_loc(key, axis=axis)
return self._get_label(lab, axis=axis)
return self._get_label(key, axis=axis)

def _getitem_iterable(self, key, axis=0):
labels = self.obj._get_axis(axis)
Expand All @@ -377,11 +368,10 @@ def _reindex(keys, level=None):

if com._is_bool_indexer(key):
key = _check_bool_indexer(labels, key)
inds, = np.asarray(key, dtype=bool).nonzero()
inds, = key.nonzero()
return self.obj.take(inds, axis=axis)
else:
was_index = isinstance(key, Index)
if was_index:
if isinstance(key, Index):
# want Index objects to pass through untouched
keyarr = key
else:
Expand Down Expand Up @@ -489,8 +479,9 @@ def _convert_to_indexer(self, obj, axis=0):

elif _is_list_like(obj):
if com._is_bool_indexer(obj):
objarr = _check_bool_indexer(labels, obj)
return objarr
obj = _check_bool_indexer(labels, obj)
inds, = obj.nonzero()
return inds
else:
if isinstance(obj, Index):
objarr = obj.values
Expand Down Expand Up @@ -672,17 +663,19 @@ def _setitem_with_indexer(self, indexer, value):
def _check_bool_indexer(ax, key):
    """Align a boolean indexer with ``ax`` and return it as a bool ndarray.

    This function assumes that ``com._is_bool_indexer(key)`` is true.

    A Series ``key`` whose index does not equal ``ax`` is reindexed to
    ``ax``.  If that reindexing leaves any null entries the key cannot be
    aligned and an error is raised.

    Raises
    ------
    IndexingError
        If a Series key contains nulls after being reindexed to ``ax``.
    """
    # boolean indexing, need to check that the data are aligned, otherwise
    # disallowed
    result = key
    if _is_series(key) and not key.index.equals(ax):
        result = result.reindex(ax)
        mask = com.isnull(result)
        if mask.any():
            raise IndexingError('Unalignable boolean Series key provided')

    # com._is_bool_indexer has already checked for nulls in the case of an
    # object array key, so no check needed here
    return np.asarray(result, dtype=bool)


Expand Down
Loading