From 7b69204904827069dcdbf67275ba7f070e70ff6b Mon Sep 17 00:00:00 2001 From: Stephen Lin Date: Mon, 28 Jan 2013 16:32:11 -0500 Subject: [PATCH] BUG: Various inconsistencies in DataFrame __getitem__ and __setitem__ behavior --- pandas/core/frame.py | 196 +++++++++++++++++--------------- pandas/core/indexing.py | 49 ++++---- pandas/core/series.py | 29 +---- pandas/sparse/frame.py | 7 -- pandas/tests/test_frame.py | 43 ++++++- pandas/tests/test_multilevel.py | 23 ++++ 6 files changed, 191 insertions(+), 156 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6c96317a645f7..2ec95c6010f06 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,7 +26,8 @@ _default_index, _is_sequence) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels +from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, + _is_index_slice, _check_bool_indexer) from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series, _radd_compat, _dtype_from_scalar from pandas.compat.scipy import scoreatpercentile as _quantile @@ -1970,73 +1971,53 @@ def iget_value(self, i, j): return self.get_value(row, col) def __getitem__(self, key): - # slice rows if isinstance(key, slice): - from pandas.core.indexing import _is_index_slice - idx_type = self.index.inferred_type - if idx_type == 'floating': - indexer = self.ix._convert_to_indexer(key, axis=0) - elif idx_type == 'integer' or _is_index_slice(key): - indexer = key - else: - indexer = self.ix._convert_to_indexer(key, axis=0) - new_data = self._data.get_slice(indexer, axis=1) - return self._constructor(new_data) - # either boolean or fancy integer index + # slice rows + return self._getitem_slice(key) elif isinstance(key, (np.ndarray, list)): - if isinstance(key, list): - key = lib.list_to_object_array(key) - - # also raises Exception if object array with NA values - if com._is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + # either boolean or fancy integer index return self._getitem_array(key) + elif isinstance(key, DataFrame): + return self._getitem_frame(key) elif isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) - elif isinstance(key, DataFrame): - if key.values.dtype == bool: - return self.where(key, try_cast = False) - else: - raise ValueError('Cannot index using non-boolean DataFrame') else: + # get column return self._get_item_cache(key) + def _getitem_slice(self, key): + idx_type = self.index.inferred_type + if idx_type == 'floating': + indexer = self.ix._convert_to_indexer(key, axis=0) + elif idx_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + return self._slice(indexer, axis=0) + def _getitem_array(self, key): - if key.dtype == np.bool_: - if len(key) != len(self.index): + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + import warnings + warnings.warn("Boolean Series key will be reindexed to match " + "DataFrame index.", UserWarning) + elif len(key) != len(self.index): raise ValueError('Item wrong length %d instead of %d!' % (len(key), len(self.index))) - - inds, = key.nonzero() - return self.take(inds) + # _check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = _check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self.take(indexer, axis=0) else: - if self.columns.is_unique: - indexer = self.columns.get_indexer(key) - mask = indexer == -1 - if mask.any(): - raise KeyError("No column(s) named: %s" % - com.pprint_thing(key[mask])) - result = self.reindex(columns=key) - if result.columns.name is None: - result.columns.name = self.columns.name - return result - else: - mask = self.columns.isin(key) - for k in key: - if k not in self.columns: - raise KeyError("No column(s) named: %s" % - com.pprint_thing(k)) - return self.take(mask.nonzero()[0], axis=1) - - def _slice(self, slobj, axis=0): - if axis == 0: - mgr_axis = 1 - else: - mgr_axis = 0 - - new_data = self._data.get_slice(slobj, axis=mgr_axis) - return self._constructor(new_data) - + indexer = self.ix._convert_to_indexer(key, axis=1) + return self.take(indexer, axis=1) + def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) if isinstance(loc, (slice, np.ndarray)): @@ -2061,6 +2042,20 @@ def _getitem_multilevel(self, key): else: return self._get_item_cache(key) + def _getitem_frame(self, key): + if key.values.dtype != np.bool_: + raise ValueError('Must pass DataFrame with boolean values only') + return self.where(key) + + def _slice(self, slobj, axis=0): + if axis == 0: + mgr_axis = 1 + else: + mgr_axis = 0 + + new_data = self._data.get_slice(slobj, axis=mgr_axis) + return self._constructor(new_data) + def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] if values.ndim == 2: @@ -2094,39 +2089,55 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) def __setitem__(self, key, value): - # support boolean setting with DataFrame input, e.g. - # df[df > df2] = 0 - if isinstance(key, DataFrame): - self._boolean_set(key, value) + if isinstance(key, slice): + # slice rows + self._setitem_slice(key, value) elif isinstance(key, (np.ndarray, list)): - return self._set_item_multiple(key, value) + self._setitem_array(key, value) + elif isinstance(key, DataFrame): + self._setitem_frame(key, value) else: # set column self._set_item(key, value) + + def _setitem_slice(self, key, value): + idx_type = self.index.inferred_type + if idx_type == 'floating': + indexer = self.ix._convert_to_indexer(key, axis=0) + elif idx_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + self.ix._setitem_with_indexer(indexer, value) - def _boolean_set(self, key, value): + def _setitem_array(self, key, value): + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + if len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d!' % + (len(key), len(self.index))) + key = _check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + self.ix._setitem_with_indexer(indexer, value) + else: + if isinstance(value, DataFrame): + if len(value.columns) != len(key): + raise AssertionError('Columns must be same length as key') + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + else: + indexer = self.ix._convert_to_indexer(key, axis=1) + self.ix._setitem_with_indexer((slice(None), indexer), value) + + def _setitem_frame(self, key, value): + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 if key.values.dtype != np.bool_: raise ValueError('Must pass DataFrame with boolean values only') - if self._is_mixed_type: raise ValueError('Cannot do boolean setting on mixed-type frame') - self.where(-key, value, inplace=True) - def _set_item_multiple(self, keys, value): - if isinstance(value, DataFrame): - if len(value.columns) != len(keys): - raise AssertionError('Columns must be same length as keys') - for k1, k2 in zip(keys, value.columns): - self[k1] = value[k2] - else: - if isinstance(keys, np.ndarray) and keys.dtype == np.bool_: - # boolean slicing should happen on rows, consistent with - # behavior of getitem - self.ix[keys, :] = value - else: - self.ix[:, keys] = value - def _set_item(self, key, value): """ Add series to DataFrame in specified column. @@ -2920,7 +2931,7 @@ def take(self, indices, axis=0): """ if isinstance(indices, list): indices = np.array(indices) - if self._data.is_mixed_dtype(): + if self._is_mixed_type: if axis == 0: new_data = self._data.take(indices, axis=1) return DataFrame(new_data) @@ -3249,7 +3260,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False): new_axis, indexer = the_axis.sortlevel(level, ascending=ascending) - if self._data.is_mixed_dtype() and not inplace: + if self._is_mixed_type and not inplace: if axis == 0: return self.reindex(index=new_axis) else: @@ -5240,25 +5251,22 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr ------- wh: DataFrame """ - if not hasattr(cond, 'shape'): - raise ValueError('where requires an ndarray like object for its ' - 'condition') - - if isinstance(cond, np.ndarray): + if isinstance(cond, DataFrame): + # this already checks for index/column equality + cond = cond.reindex(self.index, columns=self.columns) + else: + if not hasattr(cond, 'shape'): + raise ValueError('where requires an ndarray like object for its ' + 'condition') if cond.shape != self.shape: raise ValueError('Array conditional must be same shape as self') cond = self._constructor(cond, index=self.index, columns=self.columns) - if cond.shape != self.shape: - cond = cond.reindex(self.index, columns=self.columns) - - if inplace: - cond = -(cond.fillna(True).astype(bool)) - else: - cond = cond.fillna(False).astype(bool) - elif inplace: - cond = -cond + if inplace: + cond = -(cond.fillna(True).astype(bool)) + else: + cond = cond.fillna(False).astype(bool) if isinstance(other, DataFrame): _, other = self.align(other, join='left', fill_value=NA) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6966b0ab76aed..096ef7a110e93 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -265,7 +265,7 @@ def _convert_for_reindex(self, key, axis=0): if com._is_bool_indexer(key): key = _check_bool_indexer(labels, key) - return labels[np.asarray(key)] + return labels[key] else: if isinstance(key, Index): # want Index objects to pass through untouched @@ -340,28 +340,19 @@ def _getitem_axis(self, key, axis=0): raise ValueError('Cannot index with multidimensional key') return self._getitem_iterable(key, axis=axis) - elif axis == 0: - is_int_index = _is_integer_index(labels) - - idx = key + else: if com.is_integer(key): - if isinstance(labels, MultiIndex): + if axis == 0 and isinstance(labels, MultiIndex): try: - return self._get_label(key, axis=0) + return self._get_label(key, axis=axis) except (KeyError, TypeError): if _is_integer_index(self.obj.index.levels[0]): raise - if not is_int_index: - return self._get_loc(key, axis=0) + if not _is_integer_index(labels): + return self._get_loc(key, axis=axis) - return self._get_label(idx, axis=0) - else: - labels = self.obj._get_axis(axis) - lab = key - if com.is_integer(key) and not _is_integer_index(labels): - return self._get_loc(key, axis=axis) - return self._get_label(lab, axis=axis) + return self._get_label(key, axis=axis) def _getitem_iterable(self, key, axis=0): labels = self.obj._get_axis(axis) @@ -377,11 +368,10 @@ def _reindex(keys, level=None): if com._is_bool_indexer(key): key = _check_bool_indexer(labels, key) - inds, = np.asarray(key, dtype=bool).nonzero() + inds, = key.nonzero() return self.obj.take(inds, axis=axis) else: - was_index = isinstance(key, Index) - if was_index: + if isinstance(key, Index): # want Index objects to pass through untouched keyarr = key else: @@ -489,8 +479,9 @@ def _convert_to_indexer(self, obj, axis=0): elif _is_list_like(obj): if com._is_bool_indexer(obj): - objarr = _check_bool_indexer(labels, obj) - return objarr + obj = _check_bool_indexer(labels, obj) + inds, = obj.nonzero() + return inds else: if isinstance(obj, Index): objarr = obj.values @@ -672,17 +663,19 @@ def _setitem_with_indexer(self, indexer, value): def _check_bool_indexer(ax, key): # boolean indexing, need to check that the data are aligned, otherwise # disallowed - result = key - if _is_series(key) and key.dtype == np.bool_: - if not key.index.equals(ax): - result = key.reindex(ax) - if isinstance(result, np.ndarray) and result.dtype == np.object_: + # this function assumes that com._is_bool_indexer(key) == True + + result = key + if _is_series(key) and not key.index.equals(ax): + result = result.reindex(ax) mask = com.isnull(result) if mask.any(): - raise IndexingError('cannot index with vector containing ' - 'NA / NaN values') + raise IndexingError('Unalignable boolean Series key provided') + # com._is_bool_indexer has already checked for nulls in the case of an + # object array key, so no check needed here + result = np.asarray(result, dtype=bool) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index c3ae78b1b5e1f..81d8ce47d4f14 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,7 @@ _asarray_tuplesafe, is_integer_dtype) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) -from pandas.core.indexing import _SeriesIndexer +from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period from pandas.util import py3compat @@ -521,8 +521,7 @@ def __getitem__(self, key): # special handling of boolean data with NAs stored in object # arrays. Since we can't represent NA with dtype=bool if _is_bool_indexer(key): - key = self._check_bool_indexer(key) - key = np.asarray(key, dtype=bool) + key = _check_bool_indexer(self.index, key) return self._get_with(key) @@ -675,7 +674,7 @@ def __setitem__(self, key, value): # Could not hash item if _is_bool_indexer(key): - key = self._check_bool_indexer(key) + key = _check_bool_indexer(self.index, key) self.where(~key,value,inplace=True) else: self._set_with(key, value) @@ -741,28 +740,6 @@ def __getslice__(self, i, j): slobj = slice(i, j) return self.__getitem__(slobj) - def _check_bool_indexer(self, key): - # boolean indexing, need to check that the data are aligned, otherwise - # disallowed - result = key - if isinstance(key, Series) and key.dtype == np.bool_: - if not key.index.equals(self.index): - result = key.reindex(self.index) - - if isinstance(result, np.ndarray) and result.dtype == np.object_: - mask = isnull(result) - if mask.any(): - raise ValueError('cannot index with vector containing ' - 'NA / NaN values') - - # coerce to bool type - if not hasattr(result, 'shape'): - result = np.array(result) - if result.dtype != np.bool_: - result = result.astype(np.bool_) - - return result - def __setslice__(self, i, j, value): """Set slice equal to given value(s)""" if i < 0: diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index a012909131135..cfbe5ea2ee0ed 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -335,14 +335,7 @@ def __getitem__(self, key): if isinstance(key, slice): date_rng = self.index[key] return self.reindex(date_rng) - elif isinstance(key, (np.ndarray, list)): - if isinstance(key, list): - key = lib.list_to_object_array(key) - - # also raises Exception if object array with NA values - if com._is_bool_indexer(key): - key = np.asarray(key, dtype=bool) return self._getitem_array(key) else: # pragma: no cover raise diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 03fdd53ce19af..5ab8dd48868d4 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -203,6 +203,23 @@ def test_getitem_boolean(self): self.assertRaises(ValueError, self.tsframe.__getitem__, self.tsframe) + # test that Series work + indexer_obj = Series(indexer_obj, self.tsframe.index) + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + # test that Series indexers reindex + import warnings + warnings.filterwarnings(action='ignore', category=UserWarning) + + indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1]) + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + warnings.filterwarnings(action='default', category=UserWarning) + # test df[df > 0] for df in [ self.tsframe, self.mixed_frame, self.mixed_float, self.mixed_int ]: @@ -222,7 +239,6 @@ def test_getitem_boolean(self): if bif[c].dtype != bifw[c].dtype: self.assert_(bif[c].dtype == df[c].dtype) - def test_getitem_boolean_casting(self): #### this currently disabled ### @@ -368,6 +384,17 @@ def test_setitem_boolean(self): df = self.frame.copy() values = self.frame.values + df[df['A'] > 0] = 4 + values[values[:, 0] > 0] = 4 + assert_almost_equal(df.values, values) + + # test that column reindexing works + series = df['A'] == 4 + series = series.reindex(df.index[::-1]) + df[series] = 1 + values[values[:, 0] == 4] = 1 + assert_almost_equal(df.values, values) + df[df > 0] = 5 values[values > 0] = 5 assert_almost_equal(df.values, values) @@ -381,6 +408,11 @@ def test_setitem_boolean(self): np.putmask(values[:-1], values[:-1] < 0, 2) assert_almost_equal(df.values, values) + # indexed with same shape but rows-reversed df + df[df[::-1] == 2] = 3 + values[values == 2] = 3 + assert_almost_equal(df.values, values) + self.assertRaises(Exception, df.__setitem__, df * 0, 2) # index with DataFrame @@ -871,6 +903,15 @@ def test_getitem_setitem_non_ix_labels(self): assert_frame_equal(result, expected) assert_frame_equal(result2, expected) + result = df.copy() + result.ix[start:end] = 0 + result2 = df.copy() + result2[start:end] = 0 + expected = df.copy() + expected[5:11] = 0 + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + def test_ix_assign_column_mixed(self): # GH #1142 orig = self.mixed_frame.ix[:, 'B'].copy() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 87b820faa3dc8..ac9912e5c59eb 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -248,6 +248,29 @@ def test_series_setitem(self): def test_series_slice_partial(self): pass + def test_frame_getitem_setitem_boolean(self): + df = self.frame.T.copy() + values = df.values + + result = df[df > 0] + expected = df.where(df > 0) + assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + assert_almost_equal(df.values, values) + + self.assertRaises(Exception, df.__setitem__, df * 0, 2) + def test_frame_getitem_setitem_slice(self): # getitem result = self.frame.ix[:4]