diff --git a/doc/source/release.rst b/doc/source/release.rst index 6227fedda11d4..f23852885668a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -200,6 +200,11 @@ Improvements to existing features argument. (:issue:`5354`) - Added short docstrings to a few methods that were missing them + fixed the docstrings for Panel flex methods. (:issue:`5336`) + - ``NDFrame.drop()``, ``NDFrame.dropna()``, and ``.drop_duplicates()`` all + accept ``inplace`` as a keyword argument; however, this only means that the + wrapper is updated inplace; a copy is still made internally. + (:issue:`1960`, :issue:`5247`, and related :issue:`2325` [still not + closed]) API Changes ~~~~~~~~~~~ @@ -474,6 +479,9 @@ See :ref:`Internal Refactoring` - Unity ``dropna`` for Series/DataFrame signature (:issue:`5250`), tests from :issue:`5234`, courtesy of @rockg - Rewrite assert_almost_equal() in cython for performance (:issue:`4398`) + - Added an internal ``_update_inplace`` method to facilitate updating + ``NDFrame`` wrappers on inplace ops (this is only for the convenience of the caller; it + doesn't actually prevent copies). (:issue:`5247`) .. _release.bug_fixes-0.13.0: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 690ac7c3e76c9..2361c6920985b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2421,7 +2421,7 @@ def _maybe_cast(values, labels=None): #---------------------------------------------------------------------- # Reindex-based selection methods - def dropna(self, axis=0, how='any', thresh=None, subset=None): + def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False): """ Return object with labels on given axis omitted where alternately any or all of the data are missing @@ -2438,6 +2438,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None): subset : array-like Labels along other axis to consider, e.g. 
if you are dropping rows these would be a list of columns to include + inplace : bool, default False + If True, do operation inplace and return None. Returns ------- @@ -2448,31 +2450,36 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None): for ax in axis: result = result.dropna(how=how, thresh=thresh, subset=subset, axis=ax) - return result - - axis = self._get_axis_number(axis) - agg_axis = 1 - axis - - agg_obj = self - if subset is not None: - agg_axis_name = self._get_axis_name(agg_axis) - agg_obj = self.reindex(**{agg_axis_name: subset}) + else: + axis = self._get_axis_number(axis) + agg_axis = 1 - axis + + agg_obj = self + if subset is not None: + agg_axis_name = self._get_axis_name(agg_axis) + agg_obj = self.reindex(**{agg_axis_name: subset}) + + count = agg_obj.count(axis=agg_axis) + + if thresh is not None: + mask = count >= thresh + elif how == 'any': + mask = count == len(agg_obj._get_axis(agg_axis)) + elif how == 'all': + mask = count > 0 + else: + if how is not None: + raise ValueError('invalid how option: %s' % how) + else: + raise TypeError('must specify how or thresh') - count = agg_obj.count(axis=agg_axis) + result = self.take(mask.nonzero()[0], axis=axis, convert=False) - if thresh is not None: - mask = count >= thresh - elif how == 'any': - mask = count == len(agg_obj._get_axis(agg_axis)) - elif how == 'all': - mask = count > 0 + if inplace: + self._update_inplace(result) else: - if how is not None: - raise ValueError('invalid how option: %s' % how) - else: - raise TypeError('must specify how or thresh') + return result - return self.take(mask.nonzero()[0], axis=axis, convert=False) def drop_duplicates(self, cols=None, take_last=False, inplace=False): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a48488f57e833..b230df7483760 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1101,7 +1101,7 @@ def reindex_like(self, other, method=None, copy=True, limit=None): d = 
other._construct_axes_dict(method=method) return self.reindex(**d) - def drop(self, labels, axis=0, level=None): + def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): """ Return new object with labels in requested axis removed @@ -1111,6 +1111,8 @@ def drop(self, labels, axis=0, level=None): axis : int or axis name level : int or name, default None For MultiIndex + inplace : bool, default False + If True, do operation inplace and return None. Returns ------- @@ -1132,7 +1134,7 @@ def drop(self, labels, axis=0, level=None): dropped.axes[axis_].set_names(axis.names, inplace=True) except AttributeError: pass - return dropped + result = dropped else: labels = com._index_labels_to_array(labels) @@ -1147,7 +1149,20 @@ def drop(self, labels, axis=0, level=None): slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer - return self.ix[tuple(slicer)] + result = self.ix[tuple(slicer)] + + if inplace: + self._update_inplace(result) + else: + return result + + def _update_inplace(self, result): + "replace self internals with result." + # NOTE: This does *not* call __finalize__ and that's an explicit + # decision that we may revisit in the future. + self._reset_cache() + self._data = result._data + self._maybe_update_cacher() def add_prefix(self, prefix): """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 04ace84cace37..d4ba7dd4e708a 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -615,7 +615,7 @@ def _reindex_multi(self, axes, copy, fill_value): return Panel(new_values, items=new_items, major_axis=new_major, minor_axis=new_minor) - def dropna(self, axis=0, how='any', **kwargs): + def dropna(self, axis=0, how='any', inplace=False, **kwargs): """ Drop 2D from panel, holding passed axis constant @@ -627,6 +627,8 @@ def dropna(self, axis=0, how='any', **kwargs): how : {'all', 'any'}, default 'any' 'any': one or more values are NA in the DataFrame along the axis. For 'all' they all must be. 
+ inplace : bool, default False + If True, do operation inplace and return None. Returns ------- @@ -648,7 +650,11 @@ def dropna(self, axis=0, how='any', **kwargs): cond = mask == per_slice new_ax = self._get_axis(axis)[cond] - return self.reindex_axis(new_ax, axis=axis) + result = self.reindex_axis(new_ax, axis=axis) + if inplace: + self._update_inplace(result) + else: + return result def _combine(self, other, func, axis=0): if isinstance(other, Panel): diff --git a/pandas/core/series.py b/pandas/core/series.py index 572f0c44d0bc9..699dc9b31464e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1155,7 +1155,7 @@ def nunique(self): """ return len(self.value_counts()) - def drop_duplicates(self, take_last=False): + def drop_duplicates(self, take_last=False, inplace=False): """ Return Series with duplicate values removed @@ -1163,13 +1163,20 @@ def drop_duplicates(self, take_last=False): ---------- take_last : boolean, default False Take the last observed index in a group. Default first + inplace : boolean, default False + If True, performs operation inplace and returns None. Returns ------- deduplicated : Series """ duplicated = self.duplicated(take_last=take_last) - return self[-duplicated] + result = self[-duplicated] + if inplace: + return self._update_inplace(result) + else: + return result + def duplicated(self, take_last=False): """ @@ -2190,18 +2197,25 @@ def to_csv(self, path, index=True, sep=",", na_rep='', index_label=index_label, mode=mode, nanRep=nanRep, encoding=encoding, date_format=date_format) - def dropna(self, axis=0, **kwargs): + def dropna(self, axis=0, inplace=False, **kwargs): """ Return Series without null values Returns ------- valid : Series + inplace : bool (default False) + Do operation in place. 
""" axis = self._get_axis_number(axis or 0) - return remove_na(self) + result = remove_na(self) + if inplace: + self._update_inplace(result) + else: + return result - valid = lambda self: self.dropna() + valid = lambda self, inplace=False, **kwargs: self.dropna(inplace=inplace, + **kwargs) def first_valid_index(self): """ diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 52d536fc16d37..cf4060fa6d871 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -569,13 +569,16 @@ def cumsum(self, axis=0, dtype=None, out=None): return self._constructor(new_array, index=self.index, sparse_index=new_array.sp_index).__finalize__(self) return Series(new_array, index=self.index).__finalize__(self) - def dropna(self, axis=0, **kwargs): + def dropna(self, axis=0, inplace=False, **kwargs): """ Analogous to Series.dropna. If fill_value=NaN, returns a dense Series """ # TODO: make more efficient axis = self._get_axis_number(axis or 0) dense_valid = self.to_dense().valid() + if inplace: + raise NotImplementedError("Cannot perform inplace dropna" + " operations on a SparseSeries") if isnull(self.fill_value): return dense_valid else: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6c7877a8d6e4a..12b960ad376ff 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -6464,10 +6464,13 @@ def test_drop_names(self): df.index.name, df.columns.name = 'first', 'second' df_dropped_b = df.drop('b') df_dropped_e = df.drop('e', axis=1) - self.assertEqual(df_dropped_b.index.name, 'first') - self.assertEqual(df_dropped_e.index.name, 'first') - self.assertEqual(df_dropped_b.columns.name, 'second') - self.assertEqual(df_dropped_e.columns.name, 'second') + df_inplace_b, df_inplace_e = df.copy(), df.copy() + df_inplace_b.drop('b', inplace=True) + df_inplace_e.drop('e', axis=1, inplace=True) + for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): + self.assertEqual(obj.index.name, 'first') + 
self.assertEqual(obj.columns.name, 'second') + self.assertEqual(list(df.columns), ['d', 'e', 'f']) def test_dropEmptyRows(self): N = len(self.frame.index) @@ -6475,12 +6478,21 @@ def test_dropEmptyRows(self): mat[:5] = nan frame = DataFrame({'foo': mat}, index=self.frame.index) + original = Series(mat, index=self.frame.index) + expected = original.dropna() + inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() smaller_frame = frame.dropna(how='all') - self.assert_(np.array_equal(smaller_frame['foo'], mat[5:])) + # check that original was preserved + assert_series_equal(frame['foo'], original) + inplace_frame1.dropna(how='all', inplace=True) + assert_series_equal(smaller_frame['foo'], expected) + assert_series_equal(inplace_frame1['foo'], expected) smaller_frame = frame.dropna(how='all', subset=['foo']) - self.assert_(np.array_equal(smaller_frame['foo'], mat[5:])) + inplace_frame2.dropna(how='all', subset=['foo'], inplace=True) + assert_series_equal(smaller_frame['foo'], expected) + assert_series_equal(inplace_frame2['foo'], expected) def test_dropIncompleteRows(self): N = len(self.frame.index) @@ -6489,12 +6501,21 @@ def test_dropIncompleteRows(self): frame = DataFrame({'foo': mat}, index=self.frame.index) frame['bar'] = 5 + original = Series(mat, index=self.frame.index) + inp_frame1, inp_frame2 = frame.copy(), frame.copy() smaller_frame = frame.dropna() + assert_series_equal(frame['foo'], original) + inp_frame1.dropna(inplace=True) self.assert_(np.array_equal(smaller_frame['foo'], mat[5:])) + self.assert_(np.array_equal(inp_frame1['foo'], mat[5:])) samesize_frame = frame.dropna(subset=['bar']) + assert_series_equal(frame['foo'], original) + self.assert_((frame['bar'] == 5).all()) + inp_frame2.dropna(subset=['bar'], inplace=True) self.assert_(samesize_frame.index.equals(self.frame.index)) + self.assert_(inp_frame2.index.equals(self.frame.index)) def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) @@ -6502,20 +6523,32 @@ def test_dropna(self): 
dropped = df.dropna(axis=1) expected = df.ix[:, [0, 1, 3]] + inp = df.copy() + inp.dropna(axis=1, inplace=True) assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) dropped = df.dropna(axis=0) expected = df.ix[lrange(2, 6)] + inp = df.copy() + inp.dropna(axis=0, inplace=True) assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) # threshold dropped = df.dropna(axis=1, thresh=5) expected = df.ix[:, [0, 1, 3]] + inp = df.copy() + inp.dropna(axis=1, thresh=5, inplace=True) assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) dropped = df.dropna(axis=0, thresh=4) expected = df.ix[lrange(2, 6)] + inp = df.copy() + inp.dropna(axis=0, thresh=4, inplace=True) assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) dropped = df.dropna(axis=1, thresh=4) assert_frame_equal(dropped, df) @@ -6525,7 +6558,10 @@ def test_dropna(self): # subset dropped = df.dropna(axis=0, subset=[0, 1, 3]) + inp = df.copy() + inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) assert_frame_equal(dropped, df) + assert_frame_equal(inp, df) # all dropped = df.dropna(axis=1, how='all') @@ -6539,6 +6575,22 @@ def test_dropna(self): # bad input self.assertRaises(ValueError, df.dropna, axis=3) + + def test_drop_and_dropna_caching(self): + # tst that cacher updates + original = Series([1, 2, np.nan]) + expected = Series([1, 2], dtype=original.dtype) + df = pd.DataFrame({'A': original.values.copy()}) + df2 = df.copy() + df['A'].dropna() + assert_series_equal(df['A'], original) + df['A'].dropna(inplace=True) + assert_series_equal(df['A'], expected) + df2['A'].drop([1]) + assert_series_equal(df2['A'], original) + df2['A'].drop([1], inplace=True) + assert_series_equal(df2['A'], original.drop([1])) + def test_dropna_corner(self): # bad input self.assertRaises(ValueError, self.frame.dropna, how='foo') @@ -6549,13 +6601,18 @@ def test_dropna_multiple_axes(self): [4, np.nan, 5, 6], [np.nan, np.nan, np.nan, np.nan], [7, np.nan, 8, 9]]) 
- + cp = df.copy() result = df.dropna(how='all', axis=[0, 1]) result2 = df.dropna(how='all', axis=(0, 1)) expected = df.dropna(how='all').dropna(how='all', axis=1) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) + assert_frame_equal(df, cp) + + inp = df.copy() + inp.dropna(how='all', axis=(0, 1), inplace=True) + assert_frame_equal(inp, expected) def test_drop_duplicates(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 39ceba7469f36..96f14a09180ed 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1600,6 +1600,9 @@ def test_dropna(self): result = p.dropna(axis=1) exp = p.ix[:, ['a', 'c', 'e'], :] assert_panel_equal(result, exp) + inp = p.copy() + inp.dropna(axis=1, inplace=True) + assert_panel_equal(inp, exp) result = p.dropna(axis=1, how='all') assert_panel_equal(result, p) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index ad6fa68f063e1..4405fcc778886 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -3582,6 +3582,8 @@ def test_unique(self): def test_dropna_empty(self): s = Series([]) self.assert_(len(s.dropna()) == 0) + s.dropna(inplace=True) + self.assert_(len(s) == 0) # invalid axis self.assertRaises(ValueError, s.dropna, axis=1) @@ -3607,10 +3609,16 @@ def test_drop_duplicates(self): result = s.drop_duplicates() expected = s[[True, True, True, False]] assert_series_equal(result, expected) + sc = s.copy() + sc.drop_duplicates(inplace=True) + assert_series_equal(sc, expected) result = s.drop_duplicates(take_last=True) expected = s[[True, True, False, True]] assert_series_equal(result, expected) + sc = s.copy() + sc.drop_duplicates(take_last=True, inplace=True) + assert_series_equal(sc, expected) def test_sort(self): ts = self.ts.copy() @@ -5196,6 +5204,10 @@ def test_dropna_preserve_name(self): self.ts[:5] = np.nan result = self.ts.dropna() self.assertEquals(result.name, 
self.ts.name) + name = self.ts.name + ts = self.ts.copy() + ts.dropna(inplace=True) + self.assertEquals(ts.name, name) def test_numpy_unique(self): # it works!