ENH: Implement mode(dropna=False)

reidy-p · reidy-p · commit 7796dffce61f · 2018-04-21T22:54:52.000+01:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -442,6 +442,7 @@ Other Enhancements
 - Updated ``to_gbq`` and ``read_gbq`` signature and documentation to reflect changes from
   the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ
   library. (:issue:`20564`)
+- :meth:`Series.mode` and :meth:`DataFrame.mode` now support the ``dropna`` parameter, which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`)
 
 .. _whatsnew_0230.api_breaking:
 
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
 {{py:
 
 # dtype, ctype, table_type, npy_dtype
-dtypes = [('int64', 'int64_t', 'int64', 'int64'),
+dtypes = [('float64', 'float64_t', 'float64', 'float64'),
+          ('int64', 'int64_t', 'int64', 'int64'),
           ('uint64', 'uint64_t', 'uint64', 'uint64'),
           ('object', 'object', 'pymap', 'object_')]
 }}
@@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'),
 {{if dtype == 'object'}}
 
 
-def mode_{{dtype}}(ndarray[{{ctype}}] values):
+def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
 {{else}}
 
 
-def mode_{{dtype}}({{ctype}}[:] values):
+def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
 {{endif}}
     cdef:
         int count, max_count = 1
@@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values):
 
     table = kh_init_{{table_type}}()
     {{if dtype == 'object'}}
-    build_count_table_{{dtype}}(values, table, 1)
+    build_count_table_{{dtype}}(values, table, dropna)
     {{else}}
-    build_count_table_{{dtype}}(values, table, 0)
+    build_count_table_{{dtype}}(values, table, dropna)
     {{endif}}
 
     modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
@@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
         for k in range(table.n_buckets):
             if kh_exist_{{table_type}}(table, k):
                 count = table.vals[k]
-
                 if count == max_count:
                     j += 1
                 elif count > max_count:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -25,8 +25,8 @@
     is_bool_dtype, needs_i8_conversion,
     is_datetimetz,
     is_datetime64_any_dtype, is_datetime64tz_dtype,
-    is_timedelta64_dtype, is_interval_dtype,
-    is_scalar, is_list_like,
+    is_timedelta64_dtype, is_datetimelike,
+    is_interval_dtype, is_scalar, is_list_like,
     _ensure_platform_int, _ensure_object,
     _ensure_float64, _ensure_uint64,
     _ensure_int64)
@@ -791,14 +791,16 @@ def duplicated(values, keep='first'):
     return f(values, keep=keep)
 
 
-def mode(values):
+def mode(values, dropna=True):
     """
     Returns the mode(s) of an array.
 
     Parameters
     ----------
     values : array-like
         Array over which to check for duplicate values.
+    dropna : boolean, default True
+        Don't consider counts of NaN/NaT.
 
     Returns
     -------
@@ -811,20 +813,18 @@ def mode(values):
 
     # categorical is a fast-path
     if is_categorical_dtype(values):
-
         if isinstance(values, Series):
-            return Series(values.values.mode(), name=values.name)
-        return values.mode()
+            return Series(values.values.mode(dropna=dropna), name=values.name)
+        return values.mode(dropna=dropna)
 
-    values, dtype, ndtype = _ensure_data(values)
+    if dropna and is_datetimelike(values):
+        mask = values.isnull()
+        values = values[~mask]
 
-    # TODO: this should support float64
-    if ndtype not in ['int64', 'uint64', 'object']:
-        ndtype = 'object'
-        values = _ensure_object(values)
+    values, dtype, ndtype = _ensure_data(values)
 
     f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
-    result = f(values)
+    result = f(values, dropna=dropna)
     try:
         result = np.sort(result)
     except TypeError as e:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2044,20 +2044,28 @@ def max(self, numeric_only=None, **kwargs):
         else:
             return self.categories[pointer]
 
-    def mode(self):
+    def mode(self, dropna=True):
         """
         Returns the mode(s) of the Categorical.
 
         Always returns `Categorical` even if only one value.
 
+        Parameters
+        ----------
+        dropna : boolean, default True
+            Don't consider counts of NaN/NaT.
+
         Returns
         -------
         modes : `Categorical` (sorted)
         """
 
         import pandas._libs.hashtable as htable
-        good = self._codes != -1
-        values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
+        values = self._codes
+        if dropna:
+            good = self._codes != -1
+            values = self._codes[good]
+        values = sorted(htable.mode_int64(_ensure_int64(values), dropna))
         result = self._constructor(values=values, categories=self.categories,
                                    ordered=self.ordered, fastpath=True)
         return result
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6992,7 +6992,7 @@ def _get_agg_axis(self, axis_num):
         else:
             raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
 
-    def mode(self, axis=0, numeric_only=False):
+    def mode(self, axis=0, numeric_only=False, dropna=True):
         """
         Gets the mode(s) of each element along the axis selected. Adds a row
         for each mode per label, fills in gaps with nan.
@@ -7010,6 +7010,8 @@ def mode(self, axis=0, numeric_only=False):
             * 1 or 'columns' : get mode of each row
         numeric_only : boolean, default False
             if True, only apply to numeric columns
+        dropna : boolean, default True
+            Don't consider counts of NaN/NaT.
 
         Returns
         -------
@@ -7026,7 +7028,7 @@ def mode(self, axis=0, numeric_only=False):
         data = self if not numeric_only else self._get_numeric_data()
 
         def f(s):
-            return s.mode()
+            return s.mode(dropna=dropna)
 
         return data.apply(f, axis=axis)
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1420,17 +1420,22 @@ def count(self, level=None):
         return self._constructor(out, index=lev,
                                  dtype='int64').__finalize__(self)
 
-    def mode(self):
+    def mode(self, dropna=True):
         """Return the mode(s) of the dataset.
 
         Always returns Series even if only one value is returned.
 
+        Parameters
+        ----------
+        dropna : boolean, default True
+            Don't consider counts of NaN/NaT.
+
         Returns
         -------
         modes : Series (sorted)
         """
         # TODO: Add option for bins like value_counts()
-        return algorithms.mode(self)
+        return algorithms.mode(self, dropna=dropna)
 
     def unique(self):
         """
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -15,7 +15,8 @@
 from pandas.compat import lrange, product, PY35
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
-                    _np_version_under1p12, _np_version_under1p15)
+                    _np_version_under1p12, _np_version_under1p15,
+                    to_datetime, to_timedelta)
 import pandas as pd
 import pandas.core.nanops as nanops
 import pandas.core.algorithms as algorithms
@@ -889,6 +890,40 @@ def test_mode(self):
                                            dtype=df["C"].dtype)})
         tm.assert_frame_equal(df.mode(), exp)
 
+    def test_mode_dropna(self):
+        # GH 17534
+        # Test the dropna=False parameter for mode
+
+        df = pd.DataFrame({"A": [1, np.nan, np.nan, np.nan],
+                           "B": [np.nan, np.nan, 'a', np.nan],
+                           "C": Categorical([np.nan, np.nan, 'a', np.nan]),
+                           "D": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
+                           "E": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
+                           "F": [1, 1, np.nan, np.nan],
+                           "G": [np.nan, np.nan, 'a', 'a'],
+                           "H": Categorical(['a', np.nan, 'a', np.nan]),
+                           "I": to_datetime(['2000-1-2', '2000-1-2',
+                                             'NaT', 'NaT']),
+                           "J": to_timedelta(['1 days', 'nan',
+                                              '1 days', 'nan'])})
+
+        result = df.loc[:, 'A':'E'].mode(dropna=False)
+        expected = pd.DataFrame({'A': [np.nan],
+                                 'B': np.array([np.nan], dtype=object),
+                                 'C': Categorical([np.nan], categories=['a']),
+                                 'D': [pd.NaT],
+                                 'E': to_timedelta([pd.NaT])})
+        tm.assert_frame_equal(result, expected)
+
+        result = df.loc[:, 'F':'J'].mode(dropna=False)
+        expected = pd.DataFrame({'F': [1, np.nan],
+                                 'G': [np.nan, 'a'],
+                                 'H': Categorical([np.nan, 'a'],
+                                                  categories=['a']),
+                                 'I': to_datetime(['NaT', '2000-1-2']),
+                                 'J': to_timedelta(['nan', '1 days'])})
+        tm.assert_frame_equal(result, expected)
+
     def test_operators_timedelta64(self):
         from datetime import timedelta
         df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -12,7 +12,7 @@
 
 from pandas import (Series, Categorical, DataFrame, isna, notna,
                     bdate_range, date_range, _np_version_under1p10,
-                    CategoricalIndex)
+                    CategoricalIndex, to_datetime, to_timedelta)
 from pandas.core.index import MultiIndex
 from pandas.core.indexes.datetimes import Timestamp
 from pandas.core.indexes.timedeltas import Timedelta
@@ -321,6 +321,30 @@ def test_mode(self):
         exp = Series(exp, dtype='category')
         tm.assert_series_equal(Series(c).mode(), exp)
 
+    @pytest.mark.parametrize('values, expected', [
+        ([np.nan, np.nan, 1], [np.nan]),
+        ([np.nan, 1], [1, np.nan]),
+        ([np.nan, np.nan, 'a'], np.array([np.nan], dtype=object)),
+        ([np.nan, 'a'], [np.nan, 'a']),
+        (Categorical([np.nan, np.nan, 'a']),
+         Categorical([np.nan], categories=['a'])),
+        (Categorical([np.nan, 'a']),
+         Categorical([np.nan, 'a'], categories=['a'])),
+        (Categorical([np.nan, np.nan, 1]),
+         Categorical([np.nan], categories=[1])),
+        (to_datetime(['NaT', '2000-1-2', 'NaT']), [pd.NaT]),
+        (to_datetime(['NaT', '2000-1-2']), to_datetime(['NaT', '2000-1-2'])),
+        (to_timedelta(['1 days', 'nan', 'nan']), to_timedelta(['NaT'])),
+        (to_timedelta(['1 days', 'nan']), to_timedelta(['nan', '1 days']))
+    ])
+    def test_mode_dropna(self, values, expected):
+        # GH 17534
+        # Test the dropna=False parameter for mode
+
+        result = Series(values).mode(dropna=False)
+        expected = Series(expected)
+        tm.assert_series_equal(result, expected)
+
     def test_prod(self):
         self._check_stat_op('prod', np.prod)