diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 301ea9d28d001..849af9dfcb520 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,7 +26,8 @@ _default_index, _is_sequence) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels +from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, + _is_index_slice, _check_bool_indexer) from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series, _radd_compat, _dtype_from_scalar from pandas.compat.scipy import scoreatpercentile as _quantile @@ -1939,73 +1940,53 @@ def iget_value(self, i, j): return self.get_value(row, col) def __getitem__(self, key): - # slice rows if isinstance(key, slice): - from pandas.core.indexing import _is_index_slice - idx_type = self.index.inferred_type - if idx_type == 'floating': - indexer = self.ix._convert_to_indexer(key, axis=0) - elif idx_type == 'integer' or _is_index_slice(key): - indexer = key - else: - indexer = self.ix._convert_to_indexer(key, axis=0) - new_data = self._data.get_slice(indexer, axis=1) - return self._constructor(new_data) - # either boolean or fancy integer index + # slice rows + return self._getitem_slice(key) elif isinstance(key, (np.ndarray, list)): - if isinstance(key, list): - key = lib.list_to_object_array(key) - - # also raises Exception if object array with NA values - if com._is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + # either boolean or fancy integer index return self._getitem_array(key) + elif isinstance(key, DataFrame): + return self._getitem_frame(key) elif isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) - elif isinstance(key, DataFrame): - if key.values.dtype == bool: - return self.where(key) - else: - raise ValueError('Cannot index using non-boolean DataFrame') else: + # get column return self._get_item_cache(key) + def _getitem_slice(self, key): + idx_type = self.index.inferred_type + if idx_type == 'floating': + indexer = self.ix._convert_to_indexer(key, axis=0) + elif idx_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + return self._slice(indexer, axis=0) + def _getitem_array(self, key): - if key.dtype == np.bool_: - if len(key) != len(self.index): + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + import warnings + warnings.warn("Boolean Series key will be reindexed to match " + "DataFrame index.", UserWarning) + elif len(key) != len(self.index): raise ValueError('Item wrong length %d instead of %d!' % (len(key), len(self.index))) - - inds, = key.nonzero() - return self.take(inds) + # _check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = _check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self.take(indexer, axis=0) else: - if self.columns.is_unique: - indexer = self.columns.get_indexer(key) - mask = indexer == -1 - if mask.any(): - raise KeyError("No column(s) named: %s" % - com.pprint_thing(key[mask])) - result = self.reindex(columns=key) - if result.columns.name is None: - result.columns.name = self.columns.name - return result - else: - mask = self.columns.isin(key) - for k in key: - if k not in self.columns: - raise KeyError("No column(s) named: %s" % - com.pprint_thing(k)) - return self.take(mask.nonzero()[0], axis=1) - - def _slice(self, slobj, axis=0): - if axis == 0: - mgr_axis = 1 - else: - mgr_axis = 0 - - new_data = self._data.get_slice(slobj, axis=mgr_axis) - return self._constructor(new_data) - + indexer = self.ix._convert_to_indexer(key, axis=1) + return self.take(indexer, axis=1) + def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) if isinstance(loc, (slice, np.ndarray)): @@ -2030,6 +2011,20 @@ def _getitem_multilevel(self, key): else: return self._get_item_cache(key) + def _getitem_frame(self, key): + if key.values.dtype != np.bool_: + raise ValueError('Must pass DataFrame with boolean values only') + return self.where(key) + + def _slice(self, slobj, axis=0): + if axis == 0: + mgr_axis = 1 + else: + mgr_axis = 0 + + new_data = self._data.get_slice(slobj, axis=mgr_axis) + return self._constructor(new_data) + def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] if values.ndim == 2: @@ -2063,39 +2058,55 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) def __setitem__(self, key, value): - # support boolean setting with DataFrame input, e.g. - # df[df > df2] = 0 - if isinstance(key, DataFrame): - self._boolean_set(key, value) + if isinstance(key, slice): + # slice rows + self._setitem_slice(key, value) elif isinstance(key, (np.ndarray, list)): - return self._set_item_multiple(key, value) + self._setitem_array(key, value) + elif isinstance(key, DataFrame): + self._setitem_frame(key, value) else: # set column self._set_item(key, value) + + def _setitem_slice(self, key, value): + idx_type = self.index.inferred_type + if idx_type == 'floating': + indexer = self.ix._convert_to_indexer(key, axis=0) + elif idx_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + self.ix._setitem_with_indexer(indexer, value) + + def _setitem_array(self, key, value): + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + if len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d!' % + (len(key), len(self.index))) + key = _check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + self.ix._setitem_with_indexer(indexer, value) + else: + if isinstance(value, DataFrame): + if len(value.columns) != len(key): + raise AssertionError('Columns must be same length as key') + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + else: + indexer = self.ix._convert_to_indexer(key, axis=1) + self.ix._setitem_with_indexer((slice(None), indexer), value) - def _boolean_set(self, key, value): + def _setitem_frame(self, key, value): + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 if key.values.dtype != np.bool_: raise ValueError('Must pass DataFrame with boolean values only') - if self._is_mixed_type: raise ValueError('Cannot do boolean setting on mixed-type frame') - self.where(-key, value, inplace=True) - def _set_item_multiple(self, keys, value): - if isinstance(value, DataFrame): - if len(value.columns) != len(keys): - raise AssertionError('Columns must be same length as keys') - for k1, k2 in zip(keys, value.columns): - self[k1] = value[k2] - else: - if isinstance(keys, np.ndarray) and keys.dtype == np.bool_: - # boolean slicing should happen on rows, consistent with - # behavior of getitem - self.ix[keys, :] = value - else: - self.ix[:, keys] = value - def _set_item(self, key, value): """ Add series to DataFrame in specified column. @@ -2889,7 +2900,7 @@ def take(self, indices, axis=0): """ if isinstance(indices, list): indices = np.array(indices) - if self._data.is_mixed_dtype(): + if self._is_mixed_type: if axis == 0: new_data = self._data.take(indices, axis=1) return DataFrame(new_data) @@ -3218,7 +3229,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False): new_axis, indexer = the_axis.sortlevel(level, ascending=ascending) - if self._data.is_mixed_dtype() and not inplace: + if self._is_mixed_type and not inplace: if axis == 0: return self.reindex(index=new_axis) else: @@ -5235,7 +5246,9 @@ def where(self, cond, other=NA, inplace=False): cond = self._constructor(cond, index=self.index, columns=self.columns) - if cond.shape != self.shape: + if (cond.shape != self.shape or + not cond.index.equals(self.index) or + not cond.columns.equals(self.columns)): cond = cond.reindex(self.index, columns=self.columns) if inplace: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6966b0ab76aed..0fce359205e4d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -340,28 +340,19 @@ def _getitem_axis(self, key, axis=0): raise ValueError('Cannot index with multidimensional key') return self._getitem_iterable(key, axis=axis) - elif axis == 0: - is_int_index = _is_integer_index(labels) - - idx = key + else: if com.is_integer(key): - if isinstance(labels, MultiIndex): + if axis == 0 and isinstance(labels, MultiIndex): try: - return self._get_label(key, axis=0) + return self._get_label(key, axis=axis) except (KeyError, TypeError): if _is_integer_index(self.obj.index.levels[0]): raise - if not is_int_index: - return self._get_loc(key, axis=0) + if not _is_integer_index(labels): + return self._get_loc(key, axis=axis) - return self._get_label(idx, axis=0) - else: - labels = self.obj._get_axis(axis) - lab = key - if com.is_integer(key) and not _is_integer_index(labels): - return self._get_loc(key, axis=axis) - return self._get_label(lab, axis=axis) + return self._get_label(key, axis=axis) def _getitem_iterable(self, key, axis=0): labels = self.obj._get_axis(axis) @@ -377,11 +368,10 @@ def _reindex(keys, level=None): if com._is_bool_indexer(key): key = _check_bool_indexer(labels, key) - inds, = np.asarray(key, dtype=bool).nonzero() + inds, = key.nonzero() return self.obj.take(inds, axis=axis) else: - was_index = isinstance(key, Index) - if was_index: + if isinstance(key, Index): # want Index objects to pass through untouched keyarr = key else: @@ -489,8 +479,9 @@ def _convert_to_indexer(self, obj, axis=0): elif _is_list_like(obj): if com._is_bool_indexer(obj): - objarr = _check_bool_indexer(labels, obj) - return objarr + obj = _check_bool_indexer(labels, obj) + inds, = obj.nonzero() + return inds else: if isinstance(obj, Index): objarr = obj.values @@ -672,17 +663,19 @@ def _setitem_with_indexer(self, indexer, value): def _check_bool_indexer(ax, key): # boolean indexing, need to check that the data are aligned, otherwise # disallowed - result = key - if _is_series(key) and key.dtype == np.bool_: - if not key.index.equals(ax): - result = key.reindex(ax) - if isinstance(result, np.ndarray) and result.dtype == np.object_: + # this function assumes that com._is_bool_indexer(key) == True + + result = key + if _is_series(key) and not key.index.equals(ax): + result = result.reindex(ax) mask = com.isnull(result) if mask.any(): - raise IndexingError('cannot index with vector containing ' - 'NA / NaN values') + raise IndexingError('Unalignable boolean Series key provided') + # com._is_bool_indexer has already checked for nulls in the case of an + # object array key, so no check needed here + result = np.asarray(result, dtype=bool) return result diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index a012909131135..cfbe5ea2ee0ed 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -335,14 +335,7 @@ def __getitem__(self, key): if isinstance(key, slice): date_rng = self.index[key] return self.reindex(date_rng) - elif isinstance(key, (np.ndarray, list)): - if isinstance(key, list): - key = lib.list_to_object_array(key) - - # also raises Exception if object array with NA values - if com._is_bool_indexer(key): - key = np.asarray(key, dtype=bool) return self._getitem_array(key) else: # pragma: no cover raise diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 09747ba3f09f0..78f743329f676 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -171,6 +171,25 @@ def test_getitem_boolean(self): self.assertRaises(ValueError, self.tsframe.__getitem__, self.tsframe) + # test that Series work + indexer_obj = Series(indexer_obj, self.tsframe.index) + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + # test that Series indexers reindex + import warnings + warnings.filterwarnings(action='ignore', category=UserWarning) + + indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1]) + + subframe_obj = self.tsframe[indexer_obj] + print '\n'+str(subframe_obj)+'\n' + print '\n'+str(subframe)+'\n' + assert_frame_equal(subframe_obj, subframe) + + warnings.filterwarnings(action='default', category=UserWarning) + # test df[df >0] works bif = self.tsframe[self.tsframe > 0] bifw = DataFrame(np.where(self.tsframe > 0, self.tsframe, np.nan), @@ -307,6 +326,17 @@ def test_setitem_boolean(self): df = self.frame.copy() values = self.frame.values + df[df['A'] > 0] = 4 + values[values[:, 0] > 0] = 4 + assert_almost_equal(df.values, values) + + # test that column reindexing works + series = df['A'] == 4 + series = series.reindex(df.index[::-1]) + df[series] = 1 + values[values[:, 0] == 4] = 1 + assert_almost_equal(df.values, values) + df[df > 0] = 5 values[values > 0] = 5 assert_almost_equal(df.values, values) @@ -320,6 +350,11 @@ def test_setitem_boolean(self): np.putmask(values[:-1], values[:-1] < 0, 2) assert_almost_equal(df.values, values) + # indexed with same shape but rows-reversed df + df[df[::-1] == 2] = 3 + values[values == 2] = 3 + assert_almost_equal(df.values, values) + self.assertRaises(Exception, df.__setitem__, df * 0, 2) # index with DataFrame @@ -810,6 +845,15 @@ def test_getitem_setitem_non_ix_labels(self): assert_frame_equal(result, expected) assert_frame_equal(result2, expected) + result = df.copy() + result.ix[start:end] = 0 + result2 = df.copy() + result2[start:end] = 0 + expected = df.copy() + expected[5:11] = 0 + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + def test_ix_assign_column_mixed(self): # GH #1142 orig = self.mixed_frame.ix[:, 'B'].copy() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a4df141fefef9..0683479b881bf 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -248,6 +248,29 @@ def test_series_setitem(self): def test_series_slice_partial(self): pass + def test_frame_getitem_setitem_boolean(self): + df = self.frame.T.copy() + values = df.values + + result = df[df > 0] + expected = df.where(df > 0) + assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + assert_almost_equal(df.values, values) + + self.assertRaises(Exception, df.__setitem__, df * 0, 2) + def test_frame_getitem_setitem_slice(self): # getitem result = self.frame.ix[:4]