From 1672f26919cd08bcc08b267723d965b4b47c321f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 24 Jul 2016 16:28:32 +0900 Subject: [PATCH] ENH: add Index.dropna --- doc/source/api.rst | 2 ++ doc/source/whatsnew/v0.19.0.txt | 43 +++++++++++++++++++++++------- pandas/indexes/base.py | 23 ++++++++++++++++ pandas/indexes/multi.py | 13 +++++++++ pandas/tests/indexes/test_base.py | 41 ++++++++++++++++++++++++++++ pandas/tests/indexes/test_multi.py | 21 +++++++++++++++ 6 files changed, 134 insertions(+), 9 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 7b9fbb9b41a79..a510f663d19ee 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1349,6 +1349,8 @@ Modifying and Computations Index.unique Index.nunique Index.value_counts + Index.fillna + Index.dropna Conversion ~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 685c214454719..0ca9731c76011 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -259,6 +259,40 @@ Using the anchoring suffix, you can also specify the day of month to use instead pd.date_range('2015-01-01', freq='SM-14', periods=4) +.. _whatsnew_0190.enhancements.index: + +New Index methods +^^^^^^^^^^^^^^^^^ + +The following methods and options are added to ``Index``, to be more consistent with ``Series`` and ``DataFrame``. + +- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) + + .. ipython:: python + + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) + + +- ``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`) + + .. ipython:: python + + idx = pd.Index([1, 2, np.nan, 4]) + idx.dropna() + +For ``MultiIndex``, values are dropped if any level is missing by default. Specifying +``how='all'`` only drops values where all levels are missing. 
+ + midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4], + [1, 2, np.nan, np.nan]]) + midx + midx.dropna() + midx.dropna(how='all') + +- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see the :ref:`docs here <text.extractall>` (:issue:`10008`, :issue:`13156`) + .. _whatsnew_0190.enhancements.other: Other enhancements @@ -273,7 +307,6 @@ Other enhancements pd.to_numeric(s, downcast='unsigned') pd.to_numeric(s, downcast='integer') -- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here <text.extractall>` (:issue:`10008`, :issue:`13156`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) .. ipython:: python @@ -295,14 +328,6 @@ Other enhancements - The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) -- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - - .. 
ipython:: python - - idx = pd.Index(['a', 'b', 'c']) - idx.where([True, False, True]) - - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b5ce456bda254..32bcb0bcc732f 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -3243,6 +3243,29 @@ def fillna(self, value=None, downcast=None): return Index(result, name=self.name) return self._shallow_copy() + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + valid : Index + """ + + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) + + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + def _evaluate_with_timedelta_like(self, other, op, opstr): raise TypeError("can only perform ops with timedelta like values") diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 184744915bd8d..95ef18d23a037 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -597,6 +597,19 @@ def fillna(self, value=None, downcast=None): # isnull is not implemented for MultiIndex raise NotImplementedError('isnull is not defined for MultiIndex') + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + nans = [label == -1 for label in self.labels] + if how == 'any': + indexer = np.any(nans, axis=0) + elif how == 'all': + indexer = np.all(nans, axis=0) + 
else: + raise ValueError("invalid how option: {0}".format(how)) + + new_labels = [label[~indexer] for label in self.labels] + return self.copy(labels=new_labels, deep=True) + def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0ddc71b01c22a..88e49c4b55c8a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1837,6 +1837,47 @@ def test_logical_compat(self): self.assertEqual(idx.all(), idx.values.all()) self.assertEqual(idx.any(), idx.values.any()) + def test_dropna(self): + # GH 6194 + for dtype in [None, object, 'category']: + idx = pd.Index([1, 2, 3], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + + idx = pd.Index([1., 2., 3.], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.Index([1., 2., np.nan, 3.], dtype=dtype) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.Index(['A', 'B', 'C'], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.Index(['A', np.nan, 'B', 'C'], dtype=dtype) + tm.assert_index_equal(nanidx.dropna(), idx) + + tm.assert_index_equal(nanidx.dropna(how='any'), idx) + tm.assert_index_equal(nanidx.dropna(how='all'), idx) + + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03']) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03', pd.NaT]) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days']) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', + '3 days', pd.NaT]) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M') + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'], + freq='M') + 
tm.assert_index_equal(nanidx.dropna(), idx) + + msg = "invalid how option: xxx" + with tm.assertRaisesRegexp(ValueError, msg): + pd.Index([1, 2, 3]).dropna(how='xxx') + def test_get_combined_index(): from pandas.core.index import _get_combined_index diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 408f81fe1e982..809e1ab05ef6e 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2258,3 +2258,24 @@ def test_rangeindex_fallback_coercion_bug(self): result = df.index.get_level_values('buzz') expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz') tm.assert_index_equal(result, expected) + + def test_dropna(self): + # GH 6194 + idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5], + [1, 2, np.nan, np.nan, 5], + ['a', 'b', 'c', np.nan, 'e']]) + + exp = pd.MultiIndex.from_arrays([[1, 5], + [1, 5], + ['a', 'e']]) + tm.assert_index_equal(idx.dropna(), exp) + tm.assert_index_equal(idx.dropna(how='any'), exp) + + exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5], + [1, 2, np.nan, 5], + ['a', 'b', 'c', 'e']]) + tm.assert_index_equal(idx.dropna(how='all'), exp) + + msg = "invalid how option: xxx" + with tm.assertRaisesRegexp(ValueError, msg): + idx.dropna(how='xxx')