ENH: Implement mode(dropna=False) (#20779)

reidy-p · jreback · commit f1631bec96dd · 2018-05-31T17:44:29.000-04:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -13,6 +13,7 @@ New features
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
+- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
 - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
 -
 
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
 {{py:
 
 # dtype, ctype, table_type, npy_dtype
-dtypes = [('int64', 'int64_t', 'int64', 'int64'),
+dtypes = [('float64', 'float64_t', 'float64', 'float64'),
+          ('int64', 'int64_t', 'int64', 'int64'),
           ('uint64', 'uint64_t', 'uint64', 'uint64'),
           ('object', 'object', 'pymap', 'object_')]
 }}
@@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'),
 {{if dtype == 'object'}}
 
 
-def mode_{{dtype}}(ndarray[{{ctype}}] values):
+def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
 {{else}}
 
 
-def mode_{{dtype}}({{ctype}}[:] values):
+def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
 {{endif}}
     cdef:
         int count, max_count = 1
@@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values):
 
     table = kh_init_{{table_type}}()
     {{if dtype == 'object'}}
-    build_count_table_{{dtype}}(values, table, 1)
+    build_count_table_{{dtype}}(values, table, dropna)
     {{else}}
-    build_count_table_{{dtype}}(values, table, 0)
+    build_count_table_{{dtype}}(values, table, dropna)
     {{endif}}
 
     modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
@@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
         for k in range(table.n_buckets):
             if kh_exist_{{table_type}}(table, k):
                 count = table.vals[k]
-
                 if count == max_count:
                     j += 1
                 elif count > max_count:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -25,8 +25,8 @@
     is_bool_dtype, needs_i8_conversion,
     is_datetimetz,
     is_datetime64_any_dtype, is_datetime64tz_dtype,
-    is_timedelta64_dtype, is_interval_dtype,
-    is_scalar, is_list_like,
+    is_timedelta64_dtype, is_datetimelike,
+    is_interval_dtype, is_scalar, is_list_like,
     _ensure_platform_int, _ensure_object,
     _ensure_float64, _ensure_uint64,
     _ensure_int64)
@@ -798,14 +798,18 @@ def duplicated(values, keep='first'):
     return f(values, keep=keep)
 
 
-def mode(values):
+def mode(values, dropna=True):
     """
     Returns the mode(s) of an array.
 
     Parameters
     ----------
     values : array-like
         Array over which to check for duplicate values.
+    dropna : boolean, default True
+        Don't consider counts of NaN/NaT.
+
+        .. versionadded:: 0.24.0
 
     Returns
     -------
@@ -818,20 +822,18 @@ def mode(values):
 
     # categorical is a fast-path
     if is_categorical_dtype(values):
-
         if isinstance(values, Series):
-            return Series(values.values.mode(), name=values.name)
-        return values.mode()
+            return Series(values.values.mode(dropna=dropna), name=values.name)
+        return values.mode(dropna=dropna)
 
-    values, dtype, ndtype = _ensure_data(values)
+    if dropna and is_datetimelike(values):
+        mask = values.isnull()
+        values = values[~mask]
 
-    # TODO: this should support float64
-    if ndtype not in ['int64', 'uint64', 'object']:
-        ndtype = 'object'
-        values = _ensure_object(values)
+    values, dtype, ndtype = _ensure_data(values)
 
     f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
-    result = f(values)
+    result = f(values, dropna=dropna)
     try:
         result = np.sort(result)
     except TypeError as e:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2118,20 +2118,30 @@ def max(self, numeric_only=None, **kwargs):
         else:
             return self.categories[pointer]
 
-    def mode(self):
+    def mode(self, dropna=True):
         """
         Returns the mode(s) of the Categorical.
 
         Always returns `Categorical` even if only one value.
 
+        Parameters
+        ----------
+        dropna : boolean, default True
+            Don't consider counts of NaN/NaT.
+
+            .. versionadded:: 0.24.0
+
         Returns
         -------
         modes : `Categorical` (sorted)
         """
 
         import pandas._libs.hashtable as htable
-        good = self._codes != -1
-        values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
+        values = self._codes
+        if dropna:
+            good = self._codes != -1
+            values = self._codes[good]
+        values = sorted(htable.mode_int64(_ensure_int64(values), dropna))
         result = self._constructor(values=values, categories=self.categories,
                                    ordered=self.ordered, fastpath=True)
         return result
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7038,7 +7038,7 @@ def _get_agg_axis(self, axis_num):
         else:
             raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
 
-    def mode(self, axis=0, numeric_only=False):
+    def mode(self, axis=0, numeric_only=False, dropna=True):
         """
         Gets the mode(s) of each element along the axis selected. Adds a row
         for each mode per label, fills in gaps with nan.
@@ -7056,6 +7056,10 @@ def mode(self, axis=0, numeric_only=False):
             * 1 or 'columns' : get mode of each row
         numeric_only : boolean, default False
             if True, only apply to numeric columns
+        dropna : boolean, default True
+            Don't consider counts of NaN/NaT.
+
+            .. versionadded:: 0.24.0
 
         Returns
         -------
@@ -7072,7 +7076,7 @@ def mode(self, axis=0, numeric_only=False):
         data = self if not numeric_only else self._get_numeric_data()
 
         def f(s):
-            return s.mode()
+            return s.mode(dropna=dropna)
 
         return data.apply(f, axis=axis)
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1431,17 +1431,24 @@ def count(self, level=None):
         return self._constructor(out, index=lev,
                                  dtype='int64').__finalize__(self)
 
-    def mode(self):
+    def mode(self, dropna=True):
         """Return the mode(s) of the dataset.
 
         Always returns Series even if only one value is returned.
 
+        Parameters
+        -------
+        dropna : boolean, default True
+            Don't consider counts of NaN/NaT.
+
+            .. versionadded:: 0.24.0
+
         Returns
         -------
         modes : Series (sorted)
         """
         # TODO: Add option for bins like value_counts()
-        return algorithms.mode(self)
+        return algorithms.mode(self, dropna=dropna)
 
     def unique(self):
         """
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -15,11 +15,11 @@
 from pandas.compat import lrange, product, PY35
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
-                    _np_version_under1p12, _np_version_under1p15)
+                    _np_version_under1p12, _np_version_under1p15,
+                    to_datetime, to_timedelta)
 import pandas as pd
 import pandas.core.nanops as nanops
 import pandas.core.algorithms as algorithms
-import pandas.io.formats.printing as printing
 
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
@@ -840,54 +840,74 @@ def wrapper(x):
                 expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
                 tm.assert_series_equal(r1, expected)
 
-    def test_mode(self):
-        df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
-                           "B": [10, 10, 10, np.nan, 3, 4],
-                           "C": [8, 8, 8, 9, 9, 9],
-                           "D": np.arange(6, dtype='int64'),
-                           "E": [8, 8, 1, 1, 3, 3]})
-        tm.assert_frame_equal(df[["A"]].mode(),
-                              pd.DataFrame({"A": [12]}))
-        expected = pd.Series([0, 1, 2, 3, 4, 5], dtype='int64', name='D').\
-            to_frame()
-        tm.assert_frame_equal(df[["D"]].mode(), expected)
-        expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
-        tm.assert_frame_equal(df[["E"]].mode(), expected)
-        tm.assert_frame_equal(df[["A", "B"]].mode(),
-                              pd.DataFrame({"A": [12], "B": [10.]}))
-        tm.assert_frame_equal(df.mode(),
-                              pd.DataFrame({"A": [12, np.nan, np.nan, np.nan,
-                                                  np.nan, np.nan],
-                                            "B": [10, np.nan, np.nan, np.nan,
-                                                  np.nan, np.nan],
-                                            "C": [8, 9, np.nan, np.nan, np.nan,
-                                                  np.nan],
-                                            "D": [0, 1, 2, 3, 4, 5],
-                                            "E": [1, 3, 8, np.nan, np.nan,
-                                                  np.nan]}))
-
-        # outputs in sorted order
-        df["C"] = list(reversed(df["C"]))
-        printing.pprint_thing(df["C"])
-        printing.pprint_thing(df["C"].mode())
-        a, b = (df[["A", "B", "C"]].mode(),
-                pd.DataFrame({"A": [12, np.nan],
-                              "B": [10, np.nan],
-                              "C": [8, 9]}))
-        printing.pprint_thing(a)
-        printing.pprint_thing(b)
-        tm.assert_frame_equal(a, b)
-        # should work with heterogeneous types
-        df = pd.DataFrame({"A": np.arange(6, dtype='int64'),
-                           "B": pd.date_range('2011', periods=6),
-                           "C": list('abcdef')})
-        exp = pd.DataFrame({"A": pd.Series(np.arange(6, dtype='int64'),
-                                           dtype=df["A"].dtype),
-                            "B": pd.Series(pd.date_range('2011', periods=6),
-                                           dtype=df["B"].dtype),
-                            "C": pd.Series(list('abcdef'),
-                                           dtype=df["C"].dtype)})
-        tm.assert_frame_equal(df.mode(), exp)
+    @pytest.mark.parametrize("dropna, expected", [
+        (True, {'A': [12],
+                'B': [10.0],
+                'C': [1.0],
+                'D': ['a'],
+                'E': Categorical(['a'], categories=['a']),
+                'F': to_datetime(['2000-1-2']),
+                'G': to_timedelta(['1 days'])}),
+        (False, {'A': [12],
+                 'B': [10.0],
+                 'C': [np.nan],
+                 'D': np.array([np.nan], dtype=object),
+                 'E': Categorical([np.nan], categories=['a']),
+                 'F': [pd.NaT],
+                 'G': to_timedelta([pd.NaT])}),
+        (True, {'H': [8, 9, np.nan, np.nan],
+                'I': [8, 9, np.nan, np.nan],
+                'J': [1, np.nan, np.nan, np.nan],
+                'K': Categorical(['a', np.nan, np.nan, np.nan],
+                                 categories=['a']),
+                'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
+                'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
+                'N': [0, 1, 2, 3]}),
+        (False, {'H': [8, 9, np.nan, np.nan],
+                 'I': [8, 9, np.nan, np.nan],
+                 'J': [1, np.nan, np.nan, np.nan],
+                 'K': Categorical([np.nan, 'a', np.nan, np.nan],
+                                  categories=['a']),
+                 'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
+                 'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
+                 'N': [0, 1, 2, 3]})
+    ])
+    def test_mode_dropna(self, dropna, expected):
+
+        df = DataFrame({"A": [12, 12, 19, 11],
+                        "B": [10, 10, np.nan, 3],
+                        "C": [1, np.nan, np.nan, np.nan],
+                        "D": [np.nan, np.nan, 'a', np.nan],
+                        "E": Categorical([np.nan, np.nan, 'a', np.nan]),
+                        "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
+                        "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
+                        "H": [8, 8, 9, 9],
+                        "I": [9, 9, 8, 8],
+                        "J": [1, 1, np.nan, np.nan],
+                        "K": Categorical(['a', np.nan, 'a', np.nan]),
+                        "L": to_datetime(['2000-1-2', '2000-1-2',
+                                          'NaT', 'NaT']),
+                        "M": to_timedelta(['1 days', 'nan',
+                                           '1 days', 'nan']),
+                        "N": np.arange(4, dtype='int64')})
+
+        result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
+        expected = DataFrame(expected)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.skipif(not compat.PY3, reason="only PY3")
+    def test_mode_sortwarning(self):
+        # Check for the warning that is raised when the mode
+        # results cannot be sorted
+
+        df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
+        expected = DataFrame({'A': ['a', np.nan]})
+
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            result = df.mode(dropna=False)
+            result = result.sort_values(by='A').reset_index(drop=True)
+
+        tm.assert_frame_equal(result, expected)
 
     def test_operators_timedelta64(self):
         from datetime import timedelta
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ New features`
`13`	`13`	`Other Enhancements`
`14`	`14`	`^^^^^^^^^^^^^^^^^^`
`15`	`15`	- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
	`16`	+- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
`16`	`17`	- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
`17`	`18`	`-`
`18`	`19`