From 3b50aecf08f9fb49f2ae9bb72d6af4904cb3e821 Mon Sep 17 00:00:00 2001 From: reidy-p Date: Fri, 20 Apr 2018 23:39:17 +0100 Subject: [PATCH 1/7] ENH: Implement mode(dropna=False) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/hashtable_func_helper.pxi.in | 12 ++++---- pandas/core/algorithms.py | 24 +++++++-------- pandas/core/arrays/categorical.py | 14 +++++++-- pandas/core/frame.py | 6 ++-- pandas/core/series.py | 9 ++++-- pandas/tests/frame/test_analytics.py | 37 ++++++++++++++++++++++- pandas/tests/series/test_analytics.py | 26 +++++++++++++++- 8 files changed, 102 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a099fb40c35a7..3c7bc220c94c2 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -550,6 +550,7 @@ Other Enhancements - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) - :func:`cut` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`) - :func:`date_range`, :func:`timedelta_range`, and :func:`interval_range` now return a linearly spaced index if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`, :issue:`20983`, :issue:`20976`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) .. _whatsnew_0230.api_breaking: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index c97639481f12c..521e564447c59 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0): {{py: # dtype, ctype, table_type, npy_dtype -dtypes = [('int64', 'int64_t', 'int64', 'int64'), +dtypes = [('float64', 'float64_t', 'float64', 'float64'), + ('int64', 'int64_t', 'int64', 'int64'), ('uint64', 'uint64_t', 'uint64', 'uint64'), ('object', 'object', 'pymap', 'object_')] }} @@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'), {{if dtype == 'object'}} -def mode_{{dtype}}(ndarray[{{ctype}}] values): +def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna): {{else}} -def mode_{{dtype}}({{ctype}}[:] values): +def mode_{{dtype}}({{ctype}}[:] values, bint dropna): {{endif}} cdef: int count, max_count = 1 @@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values): table = kh_init_{{table_type}}() {{if dtype == 'object'}} - build_count_table_{{dtype}}(values, table, 1) + build_count_table_{{dtype}}(values, table, dropna) {{else}} - build_count_table_{{dtype}}(values, table, 0) + build_count_table_{{dtype}}(values, table, dropna) {{endif}} modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}}) @@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values): for k in range(table.n_buckets): if kh_exist_{{table_type}}(table, k): count = table.vals[k] - if count == max_count: j += 1 elif count > max_count: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88bc497f9f22d..c5b18db96ead4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -25,8 +25,8 @@ is_bool_dtype, needs_i8_conversion, is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_interval_dtype, - is_scalar, is_list_like, + is_timedelta64_dtype, is_datetimelike, + is_interval_dtype, is_scalar, is_list_like, _ensure_platform_int, _ensure_object, _ensure_float64, _ensure_uint64, _ensure_int64) @@ -798,7 +798,7 @@ def duplicated(values, keep='first'): return f(values, keep=keep) -def mode(values): +def mode(values, dropna=True): """ Returns the mode(s) of an array. @@ -806,6 +806,8 @@ def mode(values): ---------- values : array-like Array over which to check for duplicate values. + dropna : boolean, default True + Don't consider counts of NaN/NaT. Returns ------- @@ -818,20 +820,18 @@ def mode(values): # categorical is a fast-path if is_categorical_dtype(values): - if isinstance(values, Series): - return Series(values.values.mode(), name=values.name) - return values.mode() + return Series(values.values.mode(dropna=dropna), name=values.name) + return values.mode(dropna=dropna) - values, dtype, ndtype = _ensure_data(values) + if dropna and is_datetimelike(values): + mask = values.isnull() + values = values[~mask] - # TODO: this should support float64 - if ndtype not in ['int64', 'uint64', 'object']: - ndtype = 'object' - values = _ensure_object(values) + values, dtype, ndtype = _ensure_data(values) f = getattr(htable, "mode_{dtype}".format(dtype=ndtype)) - result = f(values) + result = f(values, dropna=dropna) try: result = np.sort(result) except TypeError as e: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a1a8f098b582e..abc7831dba190 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2118,20 +2118,28 @@ def max(self, numeric_only=None, **kwargs): else: return self.categories[pointer] - def mode(self): + def mode(self, dropna=True): """ Returns the mode(s) of the Categorical. Always returns `Categorical` even if only one value. + Parameters + ---------- + dropna : boolean, default True + Don't consider counts of NaN/NaT. + Returns ------- modes : `Categorical` (sorted) """ import pandas._libs.hashtable as htable - good = self._codes != -1 - values = sorted(htable.mode_int64(_ensure_int64(self._codes[good]))) + values = self._codes + if dropna: + good = self._codes != -1 + values = self._codes[good] + values = sorted(htable.mode_int64(_ensure_int64(values), dropna)) result = self._constructor(values=values, categories=self.categories, ordered=self.ordered, fastpath=True) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 22677b19192e1..932a299f1e95b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7039,7 +7039,7 @@ def _get_agg_axis(self, axis_num): else: raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num) - def mode(self, axis=0, numeric_only=False): + def mode(self, axis=0, numeric_only=False, dropna=True): """ Gets the mode(s) of each element along the axis selected. Adds a row for each mode per label, fills in gaps with nan. @@ -7057,6 +7057,8 @@ def mode(self, axis=0, numeric_only=False): * 1 or 'columns' : get mode of each row numeric_only : boolean, default False if True, only apply to numeric columns + dropna : boolean, default True + Don't consider counts of NaN/NaT. Returns ------- @@ -7073,7 +7075,7 @@ def mode(self, axis=0, numeric_only=False): data = self if not numeric_only else self._get_numeric_data() def f(s): - return s.mode() + return s.mode(dropna=dropna) return data.apply(f, axis=axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index c9329e8b9e572..4525d9e8c7c85 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1431,17 +1431,22 @@ def count(self, level=None): return self._constructor(out, index=lev, dtype='int64').__finalize__(self) - def mode(self): + def mode(self, dropna=True): """Return the mode(s) of the dataset. Always returns Series even if only one value is returned. + Parameters + ------- + dropna : boolean, default True + Don't consider counts of NaN/NaT. + Returns ------- modes : Series (sorted) """ # TODO: Add option for bins like value_counts() - return algorithms.mode(self) + return algorithms.mode(self, dropna=dropna) def unique(self): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index d1a4a5f615b86..0d9044c0766a0 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -15,7 +15,8 @@ from pandas.compat import lrange, product, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, - _np_version_under1p12, _np_version_under1p15) + _np_version_under1p12, _np_version_under1p15, + to_datetime, to_timedelta) import pandas as pd import pandas.core.nanops as nanops import pandas.core.algorithms as algorithms @@ -889,6 +890,40 @@ def test_mode(self): dtype=df["C"].dtype)}) tm.assert_frame_equal(df.mode(), exp) + def test_mode_dropna(self): + # GH 17534 + # Test the dropna=False parameter for mode + + df = pd.DataFrame({"A": [1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 'a', np.nan], + "C": Categorical([np.nan, np.nan, 'a', np.nan]), + "D": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), + "E": to_timedelta(['1 days', 'nan', 'nan', 'nan']), + "F": [1, 1, np.nan, np.nan], + "G": [np.nan, np.nan, 'a', 'a'], + "H": Categorical(['a', np.nan, 'a', np.nan]), + "I": to_datetime(['2000-1-2', '2000-1-2', + 'NaT', 'NaT']), + "J": to_timedelta(['1 days', 'nan', + '1 days', 'nan'])}) + + result = df.loc[:, 'A':'E'].mode(dropna=False) + expected = pd.DataFrame({'A': [np.nan], + 'B': np.array([np.nan], dtype=object), + 'C': Categorical([np.nan], categories=['a']), + 'D': [pd.NaT], + 'E': to_timedelta([pd.NaT])}) + tm.assert_frame_equal(result, expected) + + result = df.loc[:, 'F':'J'].mode(dropna=False) + expected = pd.DataFrame({'F': [1, np.nan], + 'G': [np.nan, 'a'], + 'H': Categorical([np.nan, 'a'], + categories=['a']), + 'I': to_datetime(['NaT', '2000-1-2']), + 'J': to_timedelta(['nan', '1 days'])}) + tm.assert_frame_equal(result, expected) + def test_operators_timedelta64(self): from datetime import timedelta df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'), diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6ea40329f4bc3..8cc62f20a54ef 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -12,7 +12,7 @@ from pandas import (Series, Categorical, DataFrame, isna, notna, bdate_range, date_range, _np_version_under1p10, - CategoricalIndex) + CategoricalIndex, to_datetime, to_timedelta) from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta @@ -321,6 +321,30 @@ def test_mode(self): exp = Series(exp, dtype='category') tm.assert_series_equal(Series(c).mode(), exp) + @pytest.mark.parametrize('values, expected', [ + ([np.nan, np.nan, 1], [np.nan]), + ([np.nan, 1], [1, np.nan]), + ([np.nan, np.nan, 'a'], np.array([np.nan], dtype=object)), + ([np.nan, 'a'], [np.nan, 'a']), + (Categorical([np.nan, np.nan, 'a']), + Categorical([np.nan], categories=['a'])), + (Categorical([np.nan, 'a']), + Categorical([np.nan, 'a'], categories=['a'])), + (Categorical([np.nan, np.nan, 1]), + Categorical([np.nan], categories=[1])), + (to_datetime(['NaT', '2000-1-2', 'NaT']), [pd.NaT]), + (to_datetime(['NaT', '2000-1-2']), to_datetime(['NaT', '2000-1-2'])), + (to_timedelta(['1 days', 'nan', 'nan']), to_timedelta(['NaT'])), + (to_timedelta(['1 days', 'nan']), to_timedelta(['nan', '1 days'])) + ]) + def test_mode_dropna(self, values, expected): + # GH 17534 + # Test the dropna=False parameter for mode + + result = Series(values).mode(dropna=False) + expected = Series(expected) + tm.assert_series_equal(result, expected) + def test_prod(self): self._check_stat_op('prod', np.prod) From c4270a578bc1c2b6cd8023f6deaa9e6d865069e6 Mon Sep 17 00:00:00 2001 From: reidy-p Date: Wed, 25 Apr 2018 22:42:54 +0100 Subject: [PATCH 2/7] clean up and parametrize mode tests and add versionadded --- pandas/core/algorithms.py | 2 + pandas/core/arrays/categorical.py | 2 + pandas/core/frame.py | 2 + pandas/core/series.py | 2 + pandas/tests/frame/test_analytics.py | 136 +++++++---------- pandas/tests/series/test_analytics.py | 202 ++++++++++++++------------ 6 files changed, 173 insertions(+), 173 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c5b18db96ead4..ccf647aebb6bb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -809,6 +809,8 @@ def mode(values, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. + .. versionadded:: 0.23.0 + Returns ------- mode : Series diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index abc7831dba190..c9a425f831279 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2129,6 +2129,8 @@ def mode(self, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. + .. versionadded:: 0.23.0 + Returns ------- modes : `Categorical` (sorted) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 932a299f1e95b..f829b38d29932 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7060,6 +7060,8 @@ def mode(self, axis=0, numeric_only=False, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. + .. versionadded:: 0.23.0 + Returns ------- modes : DataFrame (sorted) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4525d9e8c7c85..5071472e848b3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1441,6 +1441,8 @@ def mode(self, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. + .. versionadded:: 0.23.0 + Returns ------- modes : Series (sorted) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 0d9044c0766a0..5dc4d5e67d951 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -20,7 +20,6 @@ import pandas as pd import pandas.core.nanops as nanops import pandas.core.algorithms as algorithms -import pandas.io.formats.printing as printing import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -841,87 +840,62 @@ def wrapper(x): expected = pd.Series(unit, index=r1.index, dtype=r1.dtype) tm.assert_series_equal(r1, expected) - def test_mode(self): - df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], - "B": [10, 10, 10, np.nan, 3, 4], - "C": [8, 8, 8, 9, 9, 9], - "D": np.arange(6, dtype='int64'), - "E": [8, 8, 1, 1, 3, 3]}) - tm.assert_frame_equal(df[["A"]].mode(), - pd.DataFrame({"A": [12]})) - expected = pd.Series([0, 1, 2, 3, 4, 5], dtype='int64', name='D').\ - to_frame() - tm.assert_frame_equal(df[["D"]].mode(), expected) - expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame() - tm.assert_frame_equal(df[["E"]].mode(), expected) - tm.assert_frame_equal(df[["A", "B"]].mode(), - pd.DataFrame({"A": [12], "B": [10.]})) - tm.assert_frame_equal(df.mode(), - pd.DataFrame({"A": [12, np.nan, np.nan, np.nan, - np.nan, np.nan], - "B": [10, np.nan, np.nan, np.nan, - np.nan, np.nan], - "C": [8, 9, np.nan, np.nan, np.nan, - np.nan], - "D": [0, 1, 2, 3, 4, 5], - "E": [1, 3, 8, np.nan, np.nan, - np.nan]})) - - # outputs in sorted order - df["C"] = list(reversed(df["C"])) - printing.pprint_thing(df["C"]) - printing.pprint_thing(df["C"].mode()) - a, b = (df[["A", "B", "C"]].mode(), - pd.DataFrame({"A": [12, np.nan], - "B": [10, np.nan], - "C": [8, 9]})) - printing.pprint_thing(a) - printing.pprint_thing(b) - tm.assert_frame_equal(a, b) - # should work with heterogeneous types - df = pd.DataFrame({"A": np.arange(6, dtype='int64'), - "B": pd.date_range('2011', periods=6), - "C": list('abcdef')}) - exp = pd.DataFrame({"A": pd.Series(np.arange(6, dtype='int64'), - dtype=df["A"].dtype), - "B": pd.Series(pd.date_range('2011', periods=6), - dtype=df["B"].dtype), - "C": pd.Series(list('abcdef'), - dtype=df["C"].dtype)}) - tm.assert_frame_equal(df.mode(), exp) - - def test_mode_dropna(self): - # GH 17534 - # Test the dropna=False parameter for mode - - df = pd.DataFrame({"A": [1, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, 'a', np.nan], - "C": Categorical([np.nan, np.nan, 'a', np.nan]), - "D": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - "E": to_timedelta(['1 days', 'nan', 'nan', 'nan']), - "F": [1, 1, np.nan, np.nan], - "G": [np.nan, np.nan, 'a', 'a'], - "H": Categorical(['a', np.nan, 'a', np.nan]), - "I": to_datetime(['2000-1-2', '2000-1-2', + @pytest.mark.parametrize("dropna, expected", [ + (True, {'A': [12], + 'B': [10.0], + 'C': [1.0], + 'D': ['a'], + 'E': Categorical(['a'], categories=['a']), + 'F': to_datetime(['2000-1-2']), + 'G': to_timedelta(['1 days'])}), + (False, {'A': [12], + 'B': [10.0], + 'C': [np.nan], + 'D': np.array([np.nan], dtype=object), + 'E': Categorical([np.nan], categories=['a']), + 'F': [pd.NaT], + 'G': to_timedelta([pd.NaT])}), + (True, {'H': [8, 9, np.nan, np.nan], + 'I': [8, 9, np.nan, np.nan], + 'J': [1, np.nan, np.nan, np.nan], + 'K': ['a', np.nan, np.nan, np.nan], + 'L': Categorical(['a', np.nan, np.nan, np.nan], + categories=['a']), + 'M': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']), + 'N': to_timedelta(['1 days', 'nan', 'nan', 'nan']), + 'O': [0, 1, 2, 3]}), + (False, {'H': [8, 9, np.nan, np.nan], + 'I': [8, 9, np.nan, np.nan], + 'J': [1, np.nan, np.nan, np.nan], + 'K': [np.nan, 'a', np.nan, np.nan], + 'L': Categorical([np.nan, 'a', np.nan, np.nan], + categories=['a']), + 'M': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), + 'N': to_timedelta(['nan', '1 days', 'nan', 'nan']), + 'O': [0, 1, 2, 3]}) + ]) + def test_mode_dropna(self, dropna, expected): + + df = pd.DataFrame({"A": [12, 12, 19, 11], + "B": [10, 10, np.nan, 3], + "C": [1, np.nan, np.nan, np.nan], + "D": [np.nan, np.nan, 'a', np.nan], + "E": Categorical([np.nan, np.nan, 'a', np.nan]), + "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), + "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']), + "H": [8, 8, 9, 9], + "I": [9, 9, 8, 8], + "J": [1, 1, np.nan, np.nan], + "K": [np.nan, np.nan, 'a', 'a'], + "L": Categorical(['a', np.nan, 'a', np.nan]), + "M": to_datetime(['2000-1-2', '2000-1-2', 'NaT', 'NaT']), - "J": to_timedelta(['1 days', 'nan', - '1 days', 'nan'])}) - - result = df.loc[:, 'A':'E'].mode(dropna=False) - expected = pd.DataFrame({'A': [np.nan], - 'B': np.array([np.nan], dtype=object), - 'C': Categorical([np.nan], categories=['a']), - 'D': [pd.NaT], - 'E': to_timedelta([pd.NaT])}) - tm.assert_frame_equal(result, expected) - - result = df.loc[:, 'F':'J'].mode(dropna=False) - expected = pd.DataFrame({'F': [1, np.nan], - 'G': [np.nan, 'a'], - 'H': Categorical([np.nan, 'a'], - categories=['a']), - 'I': to_datetime(['NaT', '2000-1-2']), - 'J': to_timedelta(['nan', '1 days'])}) + "N": to_timedelta(['1 days', 'nan', + '1 days', 'nan']), + "O": np.arange(4, dtype='int64')}) + + result = df[sorted(list(expected.keys()))].mode(dropna=dropna) + expected = pd.DataFrame(expected) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 8cc62f20a54ef..f9b0e6858c277 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -12,7 +12,7 @@ from pandas import (Series, Categorical, DataFrame, isna, notna, bdate_range, date_range, _np_version_under1p10, - CategoricalIndex, to_datetime, to_timedelta) + CategoricalIndex) from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta @@ -225,125 +225,143 @@ def test_median(self): int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) tm.assert_almost_equal(np.median(int_ts), int_ts.median()) - def test_mode(self): - # No mode should be found. - exp = Series([], dtype=np.float64) - tm.assert_series_equal(Series([]).mode(), exp) - - exp = Series([1], dtype=np.int64) - tm.assert_series_equal(Series([1]).mode(), exp) - - exp = Series(['a', 'b', 'c'], dtype=np.object) - tm.assert_series_equal(Series(['a', 'b', 'c']).mode(), exp) + @pytest.mark.parametrize('dropna, expected', [ + (True, Series([], dtype=np.float64)), + (False, Series([], dtype=np.float64)) + ]) + def test_mode_empty(self, dropna, expected): + s = Series([], dtype=np.float64) + tm.assert_series_equal(s.mode(dropna), expected) - # Test numerical data types. - exp_single = [1] + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, [1], [1, 3], [1.0]), + (False, [1], [1, 3], [1, np.nan]), + ]) + def test_mode_numerical(self, dropna, expected1, expected2, expected3): data_single = [1] * 5 + [2] * 3 - - exp_multi = [1, 3] data_multi = [1] * 5 + [2] * 3 + [3] * 5 for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: s = Series(data_single, dtype=dt) - exp = Series(exp_single, dtype=dt) - tm.assert_series_equal(s.mode(), exp) + expected1 = Series(expected1, dtype=dt) + tm.assert_series_equal(s.mode(dropna), expected1) s = Series(data_multi, dtype=dt) - exp = Series(exp_multi, dtype=dt) - tm.assert_series_equal(s.mode(), exp) + expected2 = Series(expected2, dtype=dt) + tm.assert_series_equal(s.mode(dropna), expected2) + + s = Series([1, np.nan]) + expected = Series(expected3) + tm.assert_series_equal(s.mode(dropna), expected) + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, ['b'], ['bar'], ['nan']), + (False, ['b'], ['bar', np.nan], ['nan']) + ]) + def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. - exp = ['b'] data = ['a'] * 2 + ['b'] * 3 s = Series(data, dtype='c') - exp = Series(exp, dtype='c') - tm.assert_series_equal(s.mode(), exp) + expected1 = Series(expected1, dtype='c') + tm.assert_series_equal(s.mode(dropna), expected1) + + data = ['foo', 'bar', 'bar', np.nan, np.nan] - exp = ['bar'] - data = ['foo'] * 2 + ['bar'] * 3 + s = Series(data, dtype=object) + expected2 = Series(expected2, dtype=object) + result = s.mode(dropna).sort_values().reset_index(drop=True) + tm.assert_series_equal(result, expected2) - for dt in [str, object]: - s = Series(data, dtype=dt) - exp = Series(exp, dtype=dt) - tm.assert_series_equal(s.mode(), exp) + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - # Test datetime types. - exp = Series(['1900-05-03', '2011-01-03', - '2013-01-02'], dtype='M8[ns]') + s = Series(data, dtype=str) + expected3 = Series(expected3, dtype=str) + tm.assert_series_equal(s.mode(dropna), expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['foo'], ['foo']), + (False, ['foo'], ['foo', np.nan]) + ]) + def test_mode_mixeddtype(self, dropna, expected1, expected2): + expected = Series(expected1) + s = Series([1, 'foo', 'foo']) + tm.assert_series_equal(s.mode(dropna), expected) + + expected = Series(expected2) + s = Series([1, 'foo', 'foo', np.nan, np.nan]) + result = s.mode(dropna).sort_values().reset_index(drop=True) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['1900-05-03', '2011-01-03', '2013-01-02'], + ['2011-01-03', '2013-01-02']), + (False, [np.nan], ['nan', '2011-01-03', '2013-01-02']), + ]) + def test_mode_datetime(self, dropna, expected1, expected2): + expected1 = Series(expected1, dtype='M8[ns]') s = Series(['2011-01-03', '2013-01-02', - '1900-05-03'], dtype='M8[ns]') - tm.assert_series_equal(s.mode(), exp) + '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + tm.assert_series_equal(s.mode(dropna), expected1) - exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]') + expected2 = Series(expected2, dtype='M8[ns]') s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02'], dtype='M8[ns]') - tm.assert_series_equal(s.mode(), exp) + '2011-01-03', '2013-01-02', 'nan', 'nan'], + dtype='M8[ns]') + tm.assert_series_equal(s.mode(dropna), expected2) + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), + (False, ['nan'], ['nan', '2 min', '1 day']), + ]) + def test_mode_timedelta(self, dropna, expected1, expected2): # gh-5986: Test timedelta types. - exp = Series(['-1 days', '0 days', '1 days'], dtype='timedelta64[ns]') - s = Series(['1 days', '-1 days', '0 days'], + + s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], dtype='timedelta64[ns]') - tm.assert_series_equal(s.mode(), exp) + expected1 = Series(expected1, dtype='timedelta64[ns]') + tm.assert_series_equal(s.mode(dropna), expected1) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') - tm.assert_series_equal(s.mode(), exp) - - # Test mixed dtype. - exp = Series(['foo']) - s = Series([1, 'foo', 'foo']) - tm.assert_series_equal(s.mode(), exp) - + '2 min', '2 min', 'nan', 'nan'], + dtype='timedelta64[ns]') + expected2 = Series(expected2, dtype='timedelta64[ns]') + tm.assert_series_equal(s.mode(dropna), expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, Categorical([1, 2], categories=[1, 2]), + Categorical(['a'], categories=[1, 'a']), + Categorical([1, 3], categories=[1, 2, 3])), + (False, Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, 'a'], categories=[1, 'a']), + Categorical([np.nan, 1, 3], categories=[1, 2, 3])), + ]) + def test_mode_category(self, dropna, expected1, expected2, expected3): + s = Series(Categorical([1, 2, np.nan, np.nan])) + expected1 = Series(expected1, dtype='category') + tm.assert_series_equal(s.mode(dropna), expected1) + + s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + expected2 = Series(expected2, dtype='category') + tm.assert_series_equal(s.mode(dropna), expected2) + + s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan])) + expected3 = Series(expected3, dtype='category') + tm.assert_series_equal(s.mode(dropna), expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, [2**63], [1, 2**63]), + (False, [2**63], [1, 2**63]) + ]) + def test_mode_intoverflow(self, dropna, expected1, expected2): # Test for uint64 overflow. - exp = Series([2**63], dtype=np.uint64) + expected1 = Series(expected1, dtype=np.uint64) s = Series([1, 2**63, 2**63], dtype=np.uint64) - tm.assert_series_equal(s.mode(), exp) + tm.assert_series_equal(s.mode(dropna), expected1) - exp = Series([1, 2**63], dtype=np.uint64) + expected2 = Series(expected2, dtype=np.uint64) s = Series([1, 2**63], dtype=np.uint64) - tm.assert_series_equal(s.mode(), exp) - - # Test category dtype. - c = Categorical([1, 2]) - exp = Categorical([1, 2], categories=[1, 2]) - exp = Series(exp, dtype='category') - tm.assert_series_equal(Series(c).mode(), exp) - - c = Categorical([1, 'a', 'a']) - exp = Categorical(['a'], categories=[1, 'a']) - exp = Series(exp, dtype='category') - tm.assert_series_equal(Series(c).mode(), exp) - - c = Categorical([1, 1, 2, 3, 3]) - exp = Categorical([1, 3], categories=[1, 2, 3]) - exp = Series(exp, dtype='category') - tm.assert_series_equal(Series(c).mode(), exp) - - @pytest.mark.parametrize('values, expected', [ - ([np.nan, np.nan, 1], [np.nan]), - ([np.nan, 1], [1, np.nan]), - ([np.nan, np.nan, 'a'], np.array([np.nan], dtype=object)), - ([np.nan, 'a'], [np.nan, 'a']), - (Categorical([np.nan, np.nan, 'a']), - Categorical([np.nan], categories=['a'])), - (Categorical([np.nan, 'a']), - Categorical([np.nan, 'a'], categories=['a'])), - (Categorical([np.nan, np.nan, 1]), - Categorical([np.nan], categories=[1])), - (to_datetime(['NaT', '2000-1-2', 'NaT']), [pd.NaT]), - (to_datetime(['NaT', '2000-1-2']), to_datetime(['NaT', '2000-1-2'])), - (to_timedelta(['1 days', 'nan', 'nan']), to_timedelta(['NaT'])), - (to_timedelta(['1 days', 'nan']), to_timedelta(['nan', '1 days'])) - ]) - def test_mode_dropna(self, values, expected): - # GH 17534 - # Test the dropna=False parameter for mode - - result = Series(values).mode(dropna=False) - expected = Series(expected) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(s.mode(dropna), expected2) def test_prod(self): self._check_stat_op('prod', np.prod) From bce740abc5b01a3fa80cf45c03e39c3cb6b5fee5 Mon Sep 17 00:00:00 2001 From: reidy-p Date: Tue, 8 May 2018 19:19:29 +0100 Subject: [PATCH 3/7] Test separately for warning and fix typos --- pandas/tests/series/test_analytics.py | 29 ++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index f9b0e6858c277..a52f04bf3ff80 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -256,7 +256,7 @@ def test_mode_numerical(self, dropna, expected1, expected2, expected3): @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ (True, ['b'], ['bar'], ['nan']), - (False, ['b'], ['bar', np.nan], ['nan']) + (False, ['b'], [np.nan], ['nan']) ]) def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. @@ -266,7 +266,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): expected1 = Series(expected1, dtype='c') tm.assert_series_equal(s.mode(dropna), expected1) - data = ['foo', 'bar', 'bar', np.nan, np.nan] + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] s = Series(data, dtype=object) expected2 = Series(expected2, dtype=object) @@ -281,22 +281,22 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): @pytest.mark.parametrize('dropna, expected1, expected2', [ (True, ['foo'], ['foo']), - (False, ['foo'], ['foo', np.nan]) + (False, ['foo'], [np.nan]) ]) def test_mode_mixeddtype(self, dropna, expected1, expected2): expected = Series(expected1) s = Series([1, 'foo', 'foo']) tm.assert_series_equal(s.mode(dropna), expected) - expected = Series(expected2) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) + expected = Series(expected2, dtype=object) + s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) result = s.mode(dropna).sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('dropna, expected1, expected2', [ (True, ['1900-05-03', '2011-01-03', '2013-01-02'], ['2011-01-03', '2013-01-02']), - (False, [np.nan], ['nan', '2011-01-03', '2013-01-02']), + (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), ]) def test_mode_datetime(self, dropna, expected1, expected2): expected1 = Series(expected1, dtype='M8[ns]') @@ -312,7 +312,7 @@ def test_mode_datetime(self, dropna, expected1, expected2): @pytest.mark.parametrize('dropna, expected1, expected2', [ (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, ['nan'], ['nan', '2 min', '1 day']), + (False, [np.nan], [np.nan, '2 min', '1 day']), ]) def test_mode_timedelta(self, dropna, expected1, expected2): # gh-5986: Test timedelta types. @@ -363,6 +363,21 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): s = Series([1, 2**63], dtype=np.uint64) tm.assert_series_equal(s.mode(dropna), expected2) + @pytest.mark.parametrize('dropna, expected', [ + (False, ['foo', np.nan]), + ]) + def test_mode_sortwarning(self, dropna, expected): + # Check for the warning that is raised when the mode + # results cannot be sorted + + expected = Series(expected) + s = Series([1, 'foo', 'foo', np.nan, np.nan]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = s.mode(dropna).sort_values().reset_index(drop=True) + + tm.assert_series_equal(result, expected) + def test_prod(self): self._check_stat_op('prod', np.prod) From 1fc489376a9987d588c7e10af47eb33cbf03ee0f Mon Sep 17 00:00:00 2001 From: reidy-p Date: Tue, 8 May 2018 21:48:45 +0100 Subject: [PATCH 4/7] Only check for warning with python 3 --- pandas/tests/series/test_analytics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index a52f04bf3ff80..ce89eadc0509b 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -363,6 +363,7 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): s = Series([1, 2**63], dtype=np.uint64) tm.assert_series_equal(s.mode(dropna), expected2) + @pytest.mark.skipif(not compat.PY3, reason="only PY3") @pytest.mark.parametrize('dropna, expected', [ (False, ['foo', np.nan]), ]) From 0399822f16bbe44cfb910ab342dea7774d8a0d4e Mon Sep 17 00:00:00 2001 From: reidy-p Date: Sun, 13 May 2018 22:01:58 +0100 Subject: [PATCH 5/7] Test for sort warning in DataFrame and clean up sort test for Series --- pandas/tests/frame/test_analytics.py | 67 ++++++++++++++++----------- pandas/tests/series/test_analytics.py | 10 ++-- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 5dc4d5e67d951..b8f1acc2aa679 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -858,44 +858,55 @@ def wrapper(x): (True, {'H': [8, 9, np.nan, np.nan], 'I': [8, 9, np.nan, np.nan], 'J': [1, np.nan, np.nan, np.nan], - 'K': ['a', np.nan, np.nan, np.nan], - 'L': Categorical(['a', np.nan, np.nan, np.nan], + 'K': Categorical(['a', np.nan, np.nan, np.nan], categories=['a']), - 'M': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']), - 'N': to_timedelta(['1 days', 'nan', 'nan', 'nan']), - 'O': [0, 1, 2, 3]}), + 'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']), + 'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']), + 'N': [0, 1, 2, 3]}), (False, {'H': [8, 9, np.nan, np.nan], 'I': [8, 9, np.nan, np.nan], 'J': [1, np.nan, np.nan, np.nan], - 'K': [np.nan, 'a', np.nan, np.nan], - 'L': Categorical([np.nan, 'a', np.nan, np.nan], + 'K': Categorical([np.nan, 'a', np.nan, np.nan], categories=['a']), - 'M': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - 'N': to_timedelta(['nan', '1 days', 'nan', 'nan']), - 'O': [0, 1, 2, 3]}) + 'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), + 'M': to_timedelta(['nan', '1 days', 'nan', 'nan']), + 'N': [0, 1, 2, 3]}) ]) def test_mode_dropna(self, dropna, expected): - df = pd.DataFrame({"A": [12, 12, 19, 11], - "B": [10, 10, np.nan, 3], - "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, 'a', np.nan], - "E": Categorical([np.nan, np.nan, 'a', np.nan]), - "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']), - "H": [8, 8, 9, 9], - "I": [9, 9, 8, 8], - "J": [1, 1, np.nan, np.nan], - "K": [np.nan, np.nan, 'a', 'a'], - "L": Categorical(['a', np.nan, 'a', np.nan]), - "M": to_datetime(['2000-1-2', '2000-1-2', - 'NaT', 'NaT']), - "N": to_timedelta(['1 days', 'nan', - '1 days', 'nan']), - "O": np.arange(4, dtype='int64')}) + df = DataFrame({"A": [12, 12, 19, 11], + "B": [10, 10, np.nan, 3], + "C": [1, np.nan, np.nan, np.nan], + "D": [np.nan, np.nan, 'a', np.nan], + "E": Categorical([np.nan, np.nan, 'a', np.nan]), + "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), + "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']), + "H": [8, 8, 9, 9], + "I": [9, 9, 8, 8], + "J": [1, 1, np.nan, np.nan], + "K": Categorical(['a', np.nan, 'a', np.nan]), + "L": to_datetime(['2000-1-2', '2000-1-2', + 'NaT', 'NaT']), + "M": to_timedelta(['1 days', 'nan', + '1 days', 'nan']), + "N": np.arange(4, dtype='int64')}) result = df[sorted(list(expected.keys()))].mode(dropna=dropna) - expected = pd.DataFrame(expected) + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif(not compat.PY3, reason="only PY3") + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']}) + expected = DataFrame({'A': ['a', np.nan]}) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = df.mode(dropna=False) + result = result.sort_values(by='A').reset_index(drop=True) + tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ce89eadc0509b..077022ace426c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -364,18 +364,16 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): tm.assert_series_equal(s.mode(dropna), expected2) @pytest.mark.skipif(not compat.PY3, reason="only PY3") - @pytest.mark.parametrize('dropna, expected', [ - (False, ['foo', np.nan]), - ]) - def test_mode_sortwarning(self, dropna, expected): + def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted - expected = Series(expected) + expected = Series(['foo', np.nan]) s = Series([1, 'foo', 'foo', np.nan, np.nan]) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = s.mode(dropna).sort_values().reset_index(drop=True) + result = s.mode(dropna=False) + result = result.sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected) From 7b593121fb4c37895a6740fc2437172df44251bb Mon Sep 17 00:00:00 2001 From: reidy-p Date: Thu, 17 May 2018 22:28:47 +0100 Subject: [PATCH 6/7] change to 0.24 and parametrize series tests --- doc/source/whatsnew/v0.23.0.txt | 1 - doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/algorithms.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/tests/series/test_analytics.py | 94 ++++++++++++++++----------- 7 files changed, 61 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3c7bc220c94c2..a099fb40c35a7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -550,7 +550,6 @@ Other Enhancements - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) - :func:`cut` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`) - :func:`date_range`, :func:`timedelta_range`, and :func:`interval_range` now return a linearly spaced index if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`, :issue:`20983`, :issue:`20976`) -- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) .. _whatsnew_0230.api_breaking: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index e931450cb5c01..cc979a21d8627 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -13,6 +13,7 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) - - diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ccf647aebb6bb..0b0fa69588784 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -809,7 +809,7 @@ def mode(values, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.23.0 + .. versionadded:: 0.24.0 Returns ------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c9a425f831279..30f9c56d24f02 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2129,7 +2129,7 @@ def mode(self, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.23.0 + .. versionadded:: 0.24.0 Returns ------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f829b38d29932..c65458fb0d398 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7060,7 +7060,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.23.0 + .. versionadded:: 0.24.0 Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 5071472e848b3..6718ff637a886 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1441,7 +1441,7 @@ def mode(self, dropna=True): dropna : boolean, default True Don't consider counts of NaN/NaT. - .. versionadded:: 0.23.0 + .. versionadded:: 0.24.0 Returns ------- diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 077022ace426c..5b44cb596d201 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -231,28 +231,34 @@ def test_median(self): ]) def test_mode_empty(self, dropna, expected): s = Series([], dtype=np.float64) - tm.assert_series_equal(s.mode(dropna), expected) + result = s.mode(dropna) + tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, [1], [1, 3], [1.0]), - (False, [1], [1, 3], [1, np.nan]), + @pytest.mark.parametrize('dropna, data, expected', [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), ]) - def test_mode_numerical(self, dropna, expected1, expected2, expected3): - data_single = [1] * 5 + [2] * 3 - data_multi = [1] * 5 + [2] * 3 + [3] * 5 - - for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: - s = Series(data_single, dtype=dt) - expected1 = Series(expected1, dtype=dt) - tm.assert_series_equal(s.mode(dropna), expected1) - - s = Series(data_multi, dtype=dt) - expected2 = Series(expected2, dtype=dt) - tm.assert_series_equal(s.mode(dropna), expected2) + @pytest.mark.parametrize( + 'dt', + list(np.typecodes['AllInteger'] + np.typecodes['Float']) + ) + def test_mode_numerical(self, dropna, data, expected, dt): + s = Series(data, dtype=dt) + result = s.mode(dropna) + expected = Series(expected, dtype=dt) + tm.assert_series_equal(result, expected) - s = Series([1, np.nan]) - expected = Series(expected3) - tm.assert_series_equal(s.mode(dropna), expected) + @pytest.mark.parametrize('dropna, expected', [ + (True, [1.0]), + (False, [1, np.nan]), + ]) + def test_mode_numerical_nan(self, dropna, expected): + s = Series([1, 1, 2, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ (True, ['b'], ['bar'], ['nan']), @@ -263,34 +269,37 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): data = ['a'] * 2 + ['b'] * 3 s = Series(data, dtype='c') + result = s.mode(dropna) expected1 = Series(expected1, dtype='c') - tm.assert_series_equal(s.mode(dropna), expected1) + tm.assert_series_equal(result, expected1) data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] s = Series(data, dtype=object) - expected2 = Series(expected2, dtype=object) result = s.mode(dropna).sort_values().reset_index(drop=True) + expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] s = Series(data, dtype=str) + result = s.mode(dropna) expected3 = Series(expected3, dtype=str) - tm.assert_series_equal(s.mode(dropna), expected3) + tm.assert_series_equal(result, expected3) @pytest.mark.parametrize('dropna, expected1, expected2', [ (True, ['foo'], ['foo']), (False, ['foo'], [np.nan]) ]) def test_mode_mixeddtype(self, dropna, expected1, expected2): - expected = Series(expected1) s = Series([1, 'foo', 'foo']) - tm.assert_series_equal(s.mode(dropna), expected) + result = s.mode(dropna) + expected = Series(expected1) + tm.assert_series_equal(result, expected) - expected = Series(expected2, dtype=object) s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) result = s.mode(dropna).sort_values().reset_index(drop=True) + expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('dropna, expected1, expected2', [ @@ -299,16 +308,18 @@ def test_mode_mixeddtype(self, dropna, expected1, expected2): (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), ]) def test_mode_datetime(self, dropna, expected1, expected2): - expected1 = Series(expected1, dtype='M8[ns]') s = Series(['2011-01-03', '2013-01-02', '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') - tm.assert_series_equal(s.mode(dropna), expected1) + result = s.mode(dropna) + expected1 = Series(expected1, dtype='M8[ns]') + tm.assert_series_equal(result, expected1) - expected2 = Series(expected2, dtype='M8[ns]') s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03', '2013-01-02', 'nan', 'nan'], dtype='M8[ns]') - tm.assert_series_equal(s.mode(dropna), expected2) + result = s.mode(dropna) + expected2 = Series(expected2, dtype='M8[ns]') + tm.assert_series_equal(result, expected2) @pytest.mark.parametrize('dropna, expected1, expected2', [ (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), @@ -319,14 +330,16 @@ def test_mode_timedelta(self, dropna, expected1, expected2): s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], dtype='timedelta64[ns]') + result = s.mode(dropna) expected1 = Series(expected1, dtype='timedelta64[ns]') - tm.assert_series_equal(s.mode(dropna), expected1) + tm.assert_series_equal(result, expected1) s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', '2 min', '2 min', 'nan', 'nan'], dtype='timedelta64[ns]') + result = s.mode(dropna) expected2 = Series(expected2, dtype='timedelta64[ns]') - tm.assert_series_equal(s.mode(dropna), expected2) + tm.assert_series_equal(result, expected2) @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ (True, Categorical([1, 2], categories=[1, 2]), @@ -338,16 +351,19 @@ def test_mode_timedelta(self, dropna, expected1, expected2): ]) def test_mode_category(self, dropna, expected1, expected2, expected3): s = Series(Categorical([1, 2, np.nan, np.nan])) + result = s.mode(dropna) expected1 = Series(expected1, dtype='category') - tm.assert_series_equal(s.mode(dropna), expected1) + tm.assert_series_equal(result, expected1) s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + result = s.mode(dropna) expected2 = Series(expected2, dtype='category') - tm.assert_series_equal(s.mode(dropna), expected2) + tm.assert_series_equal(result, expected2) s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan])) + result = s.mode(dropna) expected3 = Series(expected3, dtype='category') - tm.assert_series_equal(s.mode(dropna), expected3) + tm.assert_series_equal(result, expected3) @pytest.mark.parametrize('dropna, expected1, expected2', [ (True, [2**63], [1, 2**63]), @@ -355,13 +371,15 @@ def test_mode_category(self, dropna, expected1, expected2, expected3): ]) def test_mode_intoverflow(self, dropna, expected1, expected2): # Test for uint64 overflow. - expected1 = Series(expected1, dtype=np.uint64) s = Series([1, 2**63, 2**63], dtype=np.uint64) - tm.assert_series_equal(s.mode(dropna), expected1) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=np.uint64) + tm.assert_series_equal(result, expected1) - expected2 = Series(expected2, dtype=np.uint64) s = Series([1, 2**63], dtype=np.uint64) - tm.assert_series_equal(s.mode(dropna), expected2) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=np.uint64) + tm.assert_series_equal(result, expected2) @pytest.mark.skipif(not compat.PY3, reason="only PY3") def test_mode_sortwarning(self): From 7b52c37ae208f0c8641ebc6dbe77737aac9987c9 Mon Sep 17 00:00:00 2001 From: reidy-p Date: Wed, 30 May 2018 19:56:37 +0100 Subject: [PATCH 7/7] Put series mode tests in class and remove uncessary sort_values and reset_index --- pandas/tests/series/test_analytics.py | 364 ++++++++++++-------------- 1 file changed, 174 insertions(+), 190 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 5b44cb596d201..14ae1ef42865a 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -225,176 +225,6 @@ def test_median(self): int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) tm.assert_almost_equal(np.median(int_ts), int_ts.median()) - @pytest.mark.parametrize('dropna, expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) - def test_mode_empty(self, dropna, expected): - s = Series([], dtype=np.float64) - result = s.mode(dropna) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) - @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) - ) - def test_mode_numerical(self, dropna, data, expected, dt): - s = Series(data, dtype=dt) - result = s.mode(dropna) - expected = Series(expected, dtype=dt) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) - def test_mode_numerical_nan(self, dropna, expected): - s = Series([1, 1, 2, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): - # Test string and object types. - data = ['a'] * 2 + ['b'] * 3 - - s = Series(data, dtype='c') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') - tm.assert_series_equal(result, expected1) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object) - result = s.mode(dropna).sort_values().reset_index(drop=True) - expected2 = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected2) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=str) - result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) - def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) - result = s.mode(dropna) - expected = Series(expected1) - tm.assert_series_equal(result, expected) - - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) - result = s.mode(dropna).sort_values().reset_index(drop=True) - expected = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) - def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) - def test_mode_timedelta(self, dropna, expected1, expected2): - # gh-5986: Test timedelta types. - - s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([1, 3], categories=[1, 2, 3])), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 1, 3], categories=[1, 2, 3])), - ]) - def test_mode_category(self, dropna, expected1, expected2, expected3): - s = Series(Categorical([1, 2, np.nan, np.nan])) - result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') - tm.assert_series_equal(result, expected1) - - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) - result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') - tm.assert_series_equal(result, expected2) - - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan])) - result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - (False, [2**63], [1, 2**63]) - ]) - def test_mode_intoverflow(self, dropna, expected1, expected2): - # Test for uint64 overflow. - s = Series([1, 2**63, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected1 = Series(expected1, dtype=np.uint64) - tm.assert_series_equal(result, expected1) - - s = Series([1, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=np.uint64) - tm.assert_series_equal(result, expected2) - - @pytest.mark.skipif(not compat.PY3, reason="only PY3") - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - - tm.assert_series_equal(result, expected) - def test_prod(self): self._check_stat_op('prod', np.prod) @@ -1940,6 +1770,180 @@ def s_main_dtypes(): return df +class TestMode(object): + + @pytest.mark.parametrize('dropna, expected', [ + (True, Series([], dtype=np.float64)), + (False, Series([], dtype=np.float64)) + ]) + def test_mode_empty(self, dropna, expected): + s = Series([], dtype=np.float64) + result = s.mode(dropna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, data, expected', [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ]) + @pytest.mark.parametrize( + 'dt', + list(np.typecodes['AllInteger'] + np.typecodes['Float']) + ) + def test_mode_numerical(self, dropna, data, expected, dt): + s = Series(data, dtype=dt) + result = s.mode(dropna) + expected = Series(expected, dtype=dt) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected', [ + (True, [1.0]), + (False, [1, np.nan]), + ]) + def test_mode_numerical_nan(self, dropna, expected): + s = Series([1, 1, 2, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, ['b'], ['bar'], ['nan']), + (False, ['b'], [np.nan], ['nan']) + ]) + def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + # Test string and object types. + data = ['a'] * 2 + ['b'] * 3 + + s = Series(data, dtype='c') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='c') + tm.assert_series_equal(result, expected1) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected2) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=str) + result = s.mode(dropna) + expected3 = Series(expected3, dtype=str) + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['foo'], ['foo']), + (False, ['foo'], [np.nan]) + ]) + def test_mode_mixeddtype(self, dropna, expected1, expected2): + s = Series([1, 'foo', 'foo']) + result = s.mode(dropna) + expected = Series(expected1) + tm.assert_series_equal(result, expected) + + s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['1900-05-03', '2011-01-03', '2013-01-02'], + ['2011-01-03', '2013-01-02']), + (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), + ]) + def test_mode_datetime(self, dropna, expected1, expected2): + s = Series(['2011-01-03', '2013-01-02', + '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='M8[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', + '2011-01-03', '2013-01-02', 'nan', 'nan'], + dtype='M8[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='M8[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), + (False, [np.nan], [np.nan, '2 min', '1 day']), + ]) + def test_mode_timedelta(self, dropna, expected1, expected2): + # gh-5986: Test timedelta types. + + s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', + '2 min', '2 min', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, Categorical([1, 2], categories=[1, 2]), + Categorical(['a'], categories=[1, 'a']), + Categorical([3, 1], categories=[3, 2, 1], ordered=True)), + (False, Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, 'a'], categories=[1, 'a']), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), + ]) + def test_mode_category(self, dropna, expected1, expected2, expected3): + s = Series(Categorical([1, 2, np.nan, np.nan])) + result = s.mode(dropna) + expected1 = Series(expected1, dtype='category') + tm.assert_series_equal(result, expected1) + + s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + result = s.mode(dropna) + expected2 = Series(expected2, dtype='category') + tm.assert_series_equal(result, expected2) + + s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], + categories=[3, 2, 1], ordered=True)) + result = s.mode(dropna) + expected3 = Series(expected3, dtype='category') + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, [2**63], [1, 2**63]), + (False, [2**63], [1, 2**63]) + ]) + def test_mode_intoverflow(self, dropna, expected1, expected2): + # Test for uint64 overflow. + s = Series([1, 2**63, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=np.uint64) + tm.assert_series_equal(result, expected1) + + s = Series([1, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=np.uint64) + tm.assert_series_equal(result, expected2) + + @pytest.mark.skipif(not compat.PY3, reason="only PY3") + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + expected = Series(['foo', np.nan]) + s = Series([1, 'foo', 'foo', np.nan, np.nan]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = s.mode(dropna=False) + result = result.sort_values().reset_index(drop=True) + + tm.assert_series_equal(result, expected) + + class TestNLargestNSmallest(object): @pytest.mark.parametrize( @@ -2068,26 +2072,6 @@ def test_min_max(self): assert np.isnan(_min) assert _max == 1 - def test_mode(self): - s = Series(Categorical([1, 1, 2, 4, 5, 5, 5], - categories=[5, 4, 3, 2, 1], ordered=True)) - res = s.mode() - exp = Series(Categorical([5], categories=[ - 5, 4, 3, 2, 1], ordered=True)) - tm.assert_series_equal(res, exp) - s = Series(Categorical([1, 1, 1, 4, 5, 5, 5], - categories=[5, 4, 3, 2, 1], ordered=True)) - res = s.mode() - exp = Series(Categorical([5, 1], categories=[ - 5, 4, 3, 2, 1], ordered=True)) - tm.assert_series_equal(res, exp) - s = Series(Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1], - ordered=True)) - res = s.mode() - exp = Series(Categorical([5, 4, 3, 2, 1], categories=[5, 4, 3, 2, 1], - ordered=True)) - tm.assert_series_equal(res, exp) - def test_value_counts(self): # GH 12835 cats = Categorical(list('abcccb'), categories=list('cabd'))