ENH: add Index.dropna

sinhrks · sinhrks · commit 1672f26919cd · 2016-07-28T07:34:27.000+09:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1349,6 +1349,8 @@ Modifying and Computations
    Index.unique
    Index.nunique
    Index.value_counts
+   Index.fillna
+   Index.dropna
 
 Conversion
 ~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -259,6 +259,40 @@ Using the anchoring suffix, you can also specify the day of month to use instead
 
     pd.date_range('2015-01-01', freq='SM-14', periods=4)
 
+.. _whatsnew_0190.enhancements.index:
+
+New Index methods
+^^^^^^^^^^^^^^^^^
+
+Following methods and options are added to ``Index`` to be more consistent with ``Series`` and ``DataFrame``.
+
+- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
+
+  .. ipython:: python
+
+     idx = pd.Index(['a', 'b', 'c'])
+     idx.where([True, False, True])
+
+
+- ``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`)
+
+  .. ipython:: python
+
+     idx = pd.Index([1, 2, np.nan, 4])
+     idx.dropna()
+
+For ``MultiIndex``, values are dropped if any level is missing by default. Specifying
+``how='all'`` only drops values where all levels are missing.
+
+     midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4],
+                                       [1, 2, np.nan, np.nan]])
+     midx
+     midx.dropna()
+     midx.dropna(how='all')
+
+- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
+- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here <text.extractall>` (:issue:`10008`, :issue:`13156`)
+
 .. _whatsnew_0190.enhancements.other:
 
 Other enhancements
@@ -273,7 +307,6 @@ Other enhancements
      pd.to_numeric(s, downcast='unsigned')
      pd.to_numeric(s, downcast='integer')
 
-- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here <text.extractall>` (:issue:`10008`, :issue:`13156`)
 - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`)
 
   .. ipython:: python
@@ -295,14 +328,6 @@ Other enhancements
 
 - The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na``  options (:issue:`13461`)
 
-- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
-- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
-
-  .. ipython:: python
-
-     idx = pd.Index(['a', 'b', 'c'])
-     idx.where([True, False, True])
-
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
 - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`)
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -3243,6 +3243,29 @@ def fillna(self, value=None, downcast=None):
                 return Index(result, name=self.name)
         return self._shallow_copy()
 
+    _index_shared_docs['dropna'] = """
+        Return Index without NA/NaN values
+
+        Parameters
+        ----------
+        how :  {'any', 'all'}, default 'any'
+            If the Index is a MultiIndex, drop the value when any or all levels
+            are NaN.
+
+        Returns
+        -------
+        valid : Index
+        """
+
+    @Appender(_index_shared_docs['dropna'])
+    def dropna(self, how='any'):
+        if how not in ('any', 'all'):
+            raise ValueError("invalid how option: {0}".format(how))
+
+        if self.hasnans:
+            return self._shallow_copy(self.values[~self._isnan])
+        return self._shallow_copy()
+
     def _evaluate_with_timedelta_like(self, other, op, opstr):
         raise TypeError("can only perform ops with timedelta like values")
 
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
@@ -597,6 +597,19 @@ def fillna(self, value=None, downcast=None):
         # isnull is not implemented for MultiIndex
         raise NotImplementedError('isnull is not defined for MultiIndex')
 
+    @Appender(_index_shared_docs['dropna'])
+    def dropna(self, how='any'):
+        nans = [label == -1 for label in self.labels]
+        if how == 'any':
+            indexer = np.any(nans, axis=0)
+        elif how == 'all':
+            indexer = np.all(nans, axis=0)
+        else:
+            raise ValueError("invalid how option: {0}".format(how))
+
+        new_labels = [label[~indexer] for label in self.labels]
+        return self.copy(labels=new_labels, deep=True)
+
     def get_value(self, series, key):
         # somewhat broken encapsulation
         from pandas.core.indexing import maybe_droplevels
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -1837,6 +1837,47 @@ def test_logical_compat(self):
         self.assertEqual(idx.all(), idx.values.all())
         self.assertEqual(idx.any(), idx.values.any())
 
+    def test_dropna(self):
+        # GH 6194
+        for dtype in [None, object, 'category']:
+            idx = pd.Index([1, 2, 3], dtype=dtype)
+            tm.assert_index_equal(idx.dropna(), idx)
+
+            idx = pd.Index([1., 2., 3.], dtype=dtype)
+            tm.assert_index_equal(idx.dropna(), idx)
+            nanidx = pd.Index([1., 2., np.nan, 3.], dtype=dtype)
+            tm.assert_index_equal(nanidx.dropna(), idx)
+
+            idx = pd.Index(['A', 'B', 'C'], dtype=dtype)
+            tm.assert_index_equal(idx.dropna(), idx)
+            nanidx = pd.Index(['A', np.nan, 'B', 'C'], dtype=dtype)
+            tm.assert_index_equal(nanidx.dropna(), idx)
+
+            tm.assert_index_equal(nanidx.dropna(how='any'), idx)
+            tm.assert_index_equal(nanidx.dropna(how='all'), idx)
+
+        idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])
+        tm.assert_index_equal(idx.dropna(), idx)
+        nanidx = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
+                                   '2011-01-03', pd.NaT])
+        tm.assert_index_equal(nanidx.dropna(), idx)
+
+        idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days'])
+        tm.assert_index_equal(idx.dropna(), idx)
+        nanidx = pd.TimedeltaIndex([pd.NaT, '1 days', '2 days',
+                                   '3 days', pd.NaT])
+        tm.assert_index_equal(nanidx.dropna(), idx)
+
+        idx = pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M')
+        tm.assert_index_equal(idx.dropna(), idx)
+        nanidx = pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'],
+                                freq='M')
+        tm.assert_index_equal(nanidx.dropna(), idx)
+
+        msg = "invalid how option: xxx"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            pd.Index([1, 2, 3]).dropna(how='xxx')
+
 
 def test_get_combined_index():
     from pandas.core.index import _get_combined_index
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
@@ -2258,3 +2258,24 @@ def test_rangeindex_fallback_coercion_bug(self):
         result = df.index.get_level_values('buzz')
         expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz')
         tm.assert_index_equal(result, expected)
+
+    def test_dropna(self):
+        # GH 6194
+        idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5],
+                                         [1, 2, np.nan, np.nan, 5],
+                                         ['a', 'b', 'c', np.nan, 'e']])
+
+        exp = pd.MultiIndex.from_arrays([[1, 5],
+                                         [1, 5],
+                                         ['a', 'e']])
+        tm.assert_index_equal(idx.dropna(), exp)
+        tm.assert_index_equal(idx.dropna(how='any'), exp)
+
+        exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5],
+                                         [1, 2, np.nan, 5],
+                                         ['a', 'b', 'c', 'e']])
+        tm.assert_index_equal(idx.dropna(how='all'), exp)
+
+        msg = "invalid how option: xxx"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            idx.dropna(how='xxx')