diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 43bf6d9dd1fee..eeb5592d744d6 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1,6 +1,7 @@
 import cython
 from cython import Py_ssize_t
 
+from cpython.ref cimport PyObject
 from cython cimport floating
 from libc.stdlib cimport (
     free,
@@ -56,6 +57,7 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_NEAREST,
     INTERPOLATION_MIDPOINT
 
+include "groupby_mode_helper.pxi"
 
 cdef inline float64_t median_linear(float64_t* a, int n) nogil:
     cdef:
diff --git a/pandas/_libs/groupby_mode_helper.pxi.in b/pandas/_libs/groupby_mode_helper.pxi.in
new file mode 100644
index 0000000000000..de784a9ce7d09
--- /dev/null
+++ b/pandas/_libs/groupby_mode_helper.pxi.in
@@ -0,0 +1,192 @@
+{{py:
+
+# name
+cimported_types = [#'complex64',
+                   #'complex128',
+                   'float32',
+                   'float64',
+                   'int8',
+                   'int16',
+                   'int32',
+                   'int64',
+                   'pymap',
+                   'str',
+                   'strbox',
+                   'uint8',
+                   'uint16',
+                   'uint32',
+                   'uint64']
+}}
+
+{{for name in cimported_types}}
+from pandas._libs.khash cimport (
+    kh_destroy_{{name}},
+    kh_exist_{{name}},
+    kh_get_{{name}},
+    kh_init_{{name}},
+    kh_put_{{name}},
+    kh_resize_{{name}},
+    kh_{{name}}_t,
+)
+
+{{endfor}}
+
+from pandas._libs.khash cimport (
+    #khcomplex64_t,
+    #khcomplex128_t,
+    khiter_t,
+)
+
+import numpy as np
+
+from pandas._libs.hashtable import (
+    # NaN checking
+    #is_nan_khcomplex128_t,
+    #is_nan_khcomplex64_t,
+    is_nan_float64_t,
+    is_nan_float32_t,
+    is_nan_int64_t,
+    is_nan_int32_t,
+    is_nan_int16_t,
+    is_nan_int8_t,
+    is_nan_uint64_t,
+    is_nan_uint32_t,
+    is_nan_uint16_t,
+    is_nan_uint8_t,
+    # Casting
+    #to_complex64,
+    #to_complex128,
+    #to_khcomplex128_t,
+    #to_khcomplex64_t,
+)
+
+{{py:
+# TODO: add complex64 and complex128 (requires comparisons between complex numbers)
+# dtype, ttype, c_type, to_c_type, to_dtype
+dtypes = [#('complex128', 'complex128', 'khcomplex128_t',
+          # 'to_khcomplex128_t', 'to_complex128'),
+          #('complex64', 'complex64', 'khcomplex64_t',
+          # 'to_khcomplex64_t', 'to_complex64'),
+          ('float64', 'float64', 'float64_t', '', ''),
+          ('float32', 'float32', 'float32_t', '', ''),
+          ('uint64', 'uint64', 'uint64_t', '', ''),
+          ('uint32', 'uint32', 'uint32_t', '', ''),
+          ('uint16', 'uint16', 'uint16_t', '', ''),
+          ('uint8', 'uint8', 'uint8_t', '', ''),
+          ('object', 'pymap', 'object', '', ''),
+          ('int64', 'int64', 'int64_t', '', ''),
+          ('int32', 'int32', 'int32_t', '', ''),
+          ('int16', 'int16', 'int16_t', '', ''),
+          ('int8', 'int8', 'int8_t', '', '')]
+
+}}
+
+{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef {{c_type}} calc_mode_{{dtype}}(kh_{{ttype}}_t *table):
+    """
+    Return the most frequent key stored in ``table``.
+
+    ``table`` maps each distinct value to its number of occurrences. Numeric
+    ties are broken in favour of the smallest key; for object keys the last
+    tied key met while scanning the buckets wins. An empty table gives 0.
+    """
+    cdef:
+        {{c_type}} mode = 0  # fix annoying uninitialized warning
+        {{c_type}} val
+        int count, max_count = 0
+        khiter_t k
+
+    for k in range(table.n_buckets):
+        if kh_exist_{{ttype}}(table, k):
+            count = table.vals[k]
+            val = table.keys[k]
+            {{if dtype != 'object'}}
+            if count == max_count and val < mode:
+            {{else}}
+            if count == max_count:
+            {{endif}}
+                mode = val
+            elif count > max_count:
+                mode = val
+                max_count = count
+    return mode
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mode_{{dtype}}(ndarray[{{c_type}}, ndim=1] out,
+                         ndarray[{{c_type}}, ndim=1] values,
+                         ndarray[int64_t, ndim=1] labels,
+                         bint dropna = True):
+    """
+    Calculate the mode of each group and store it in ``out``.
+
+    If multimodal, the smallest mode of the group is used for numeric data.
+    For all other datatypes, one of the modes is used.
+
+    Parameters
+    ----------
+    out : ndarray[{{c_type}}]
+        Result buffer, indexed by group label.
+    values : ndarray[{{c_type}}]
+        Values to aggregate.
+    labels : ndarray[int64_t]
+        Group label of every element of ``values``. Rows with a negative
+        label are skipped. Labels may appear in any order.
+    dropna : bool, default True
+        Whether missing values are left out of the counts.
+    """
+    cdef:
+        Py_ssize_t i, j, N = len(values)
+        int64_t lab, curr_label = -1
+        {{c_type}} val
+        ndarray[int64_t, ndim=1] order
+        kh_{{ttype}}_t *table
+        khiter_t k
+        int ret = 0
+
+    # Rows of one group need not be adjacent, so visit them in label order.
+    # The stable sort keeps the original row order inside each group.
+    order = np.argsort(labels, kind="stable").astype(np.int64, copy=False)
+
+    table = kh_init_{{ttype}}()
+    # TODO: Fix NOGIL later
+    for j in range(N):
+        i = order[j]
+        lab = labels[i]
+        if lab < 0:  # NaN case
+            continue
+        if lab != curr_label and curr_label != -1:
+            # A new group starts: store the mode of the finished one and
+            # free its table before allocating a fresh one.
+            out[curr_label] = calc_mode_{{dtype}}(table)
+            kh_destroy_{{ttype}}(table)
+            table = kh_init_{{ttype}}()
+
+        {{if dtype != 'object'}}
+        val = {{to_c_type}}(values[i])
+        if not is_nan_{{c_type}}(val) or not dropna:
+        {{else}}
+        # NOTE(review): kh_*_pymap presumably works on PyObject* keys; confirm
+        # whether <PyObject*> casts are needed here and in calc_mode_object.
+        val = values[i]
+        if not checknull(val) or not dropna:
+        {{endif}}
+            k = kh_get_{{ttype}}(table, val)
+            if k != table.n_buckets:
+                table.vals[k] += 1
+            else:
+                k = kh_put_{{ttype}}(table, val, &ret)
+                table.vals[k] = 1
+        curr_label = lab
+
+    # Calc mode for the last group. When no row had a valid label there is
+    # nothing to store (out[-1] would be out of bounds without wraparound).
+    if curr_label != -1:
+        out[curr_label] = calc_mode_{{dtype}}(table)
+    kh_destroy_{{ttype}}(table)
+{{endfor}}
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 0b6bb170cc531..21a7550a80496 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -42,7 +42,8 @@ c_types = ['khcomplex128_t',
 
 {{for c_type in c_types}}
 
-cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
+cpdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
+    # TODO: create missing.pxi.in and move there as cdef?
     {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
     return val.real != val.real or val.imag != val.imag
     {{elif c_type in {'float64_t', 'float32_t'} }}
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
index c169e29b74dbb..f92decce6c6be 100644
--- a/pandas/core/groupby/base.py
+++ b/pandas/core/groupby/base.py
@@ -141,6 +141,7 @@ def _gotitem(self, key, ndim, subset=None):
         "mad",
         "max",
         "mean",
         "median",
         "min",
+        "mode",
         "ngroup",
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index bc277bf67614d..072b53ba65381 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1578,6 +1578,36 @@ def median(self, numeric_only=True):
             numeric_only=numeric_only,
         )
 
+    @final
+    @Substitution(name="groupby")
+    @Appender(_common_see_also)
+    def mode(self, dropna=True, numeric_only=False):
+        """
+        Compute mode of groups.
+
+        If a group has multiple modes, the smallest mode will be used.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Do not use NaNs in mode calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        Returns
+        -------
+        Series or DataFrame
+            Mode of values within each group.
+        """
+        # Note: get_cythonized_result iterates in python, slow for many columns?
+        return self._get_cythonized_result(
+            "group_mode",
+            aggregate=True,
+            numeric_only=numeric_only,
+            needs_values=True,
+            dropna=dropna,
+        )
+
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
@@ -2585,7 +2615,7 @@ def cummax(self, axis=0, **kwargs):
     def _get_cythonized_result(
         self,
         how: str,
-        cython_dtype: np.dtype,
+        cython_dtype: np.dtype = None,
         aggregate: bool = False,
         numeric_only: bool = True,
         needs_counts: bool = False,
@@ -2666,12 +2696,21 @@ def _get_cythonized_result(
 
         labels, _, ngroups = grouper.group_info
         output: Dict[base.OutputKey, np.ndarray] = {}
-        base_func = getattr(libgroupby, how)
+        # Without an explicit dtype every column picks the kernel compiled for
+        # its own dtype, e.g. "group_mode_int64".
+        infer_dtype = cython_dtype is None
+        if not infer_dtype:
+            base_func = getattr(libgroupby, how)
 
         error_msg = ""
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             values = obj._values
+            if infer_dtype:
+                # Resolve per column: reusing the first column's dtype would
+                # send later columns of another dtype to the wrong kernel.
+                cython_dtype = values.dtype
+                base_func = getattr(libgroupby, f"{how}_{cython_dtype}")
 
             if numeric_only and not is_numeric_dtype(values):
                 continue
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index f9b45f4d9f4cf..d0085a408785b 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -388,3 +388,25 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
 
     result = grouped["col"].aggregate(op_name)
     assert result.dtype == expected_dtype
+
+
+def test_mode_numeric():
+    data = {
+        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
+        "B": ["A", "B"] * 6,
+        "C": [2, 4, 3, 1, 2, 3, 4, 5, 2, 7, 9, 9],
+    }
+    # Group by 1 column
+    df = DataFrame(data).drop(columns="B")
+    result = df.groupby("A").mode()
+    exp = DataFrame({"C": [1, 2]}, index=Series([0.0, 1.0], name="A"))
+    tm.assert_frame_equal(result, exp)
+    # Group by 2 columns; the rows of each group are interleaved, and ties
+    # resolve to the smallest value
+    df = DataFrame(data)
+    result = df.groupby(by=["A", "B"]).mode()
+    exp = DataFrame(
+        {"C": [2, 1, 2, 3]},
+        index=pd.MultiIndex.from_product([[0.0, 1.0], ["A", "B"]], names=["A", "B"]),
+    )
+    tm.assert_frame_equal(result, exp)
diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py
index de8335738791d..3a5e0fb0231d2 100644
--- a/pandas/tests/groupby/test_allowlist.py
+++ b/pandas/tests/groupby/test_allowlist.py
@@ -285,6 +285,7 @@ def test_tab_completion(mframe):
         "mean",
         "median",
         "min",
+        "mode",
         "ngroups",
         "nth",
         "ohlc",
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 4049ef46f3006..f518997c7e7db 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -52,6 +52,7 @@ def f(a):
     "max": np.NaN,
     "mean": np.NaN,
     "median": np.NaN,
     "min": np.NaN,
+    "mode": np.NaN,
     "nth": np.NaN,
     "nunique": 0,
diff --git a/setup.py b/setup.py
index 45548fed68322..47a7b18d20d32 100755
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,10 @@ def is_platform_mac():
 
 _pxi_dep_template = {
     "algos": ["_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in"],
+    "groupby": [
+        "_libs/groupby_mode_helper.pxi.in",
+        "_libs/khash_for_primitive_helper.pxi.in",
+    ],
     "hashtable": [
         "_libs/hashtable_class_helper.pxi.in",
         "_libs/hashtable_func_helper.pxi.in",
@@ -440,7 +444,14 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
         "include": klib_include,
         "depends": _pxi_dep["algos"],
     },
-    "_libs.groupby": {"pyxfile": "_libs/groupby"},
+    "_libs.groupby": {
+        "pyxfile": "_libs/groupby",
+        "include": klib_include,
+        "depends": (
+            ["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"]
+            + _pxi_dep["groupby"]
+        ),
+    },
     "_libs.hashing": {"pyxfile": "_libs/hashing", "depends": []},
     "_libs.hashtable": {
         "pyxfile": "_libs/hashtable",