From 10d72e88fd46a60261bde46c397b63c618b0c905 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 17 Feb 2021 09:37:37 -0800 Subject: [PATCH 1/5] Proof of concept --- pandas/_libs/groupby.pyx | 3 + pandas/_libs/groupby_mode_helper.pxi.in | 217 ++++++++++++++++++ pandas/_libs/hashtable_class_helper.pxi.in | 3 +- pandas/core/groupby/groupby.py | 39 +++- pandas/tests/groupby/aggregate/test_cython.py | 22 ++ setup.py | 13 +- 6 files changed, 293 insertions(+), 4 deletions(-) create mode 100644 pandas/_libs/groupby_mode_helper.pxi.in diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 553ecbc58e745..476c40506ba68 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,6 +1,7 @@ import cython from cython import Py_ssize_t +from cpython.ref cimport PyObject from cython cimport floating from libc.stdlib cimport free, malloc @@ -33,6 +34,7 @@ from pandas._libs.algos import groupsort_indexer, rank_1d, take_2d_axis1_float64 from pandas._libs.missing cimport checknull +include "groupby_mode_helper.pxi" cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max @@ -46,6 +48,7 @@ cdef enum InterpolationEnumType: INTERPOLATION_NEAREST, INTERPOLATION_MIDPOINT +include "groupby_mode_helper.pxi" cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef: diff --git a/pandas/_libs/groupby_mode_helper.pxi.in b/pandas/_libs/groupby_mode_helper.pxi.in new file mode 100644 index 0000000000000..54d4f2ca8458f --- /dev/null +++ b/pandas/_libs/groupby_mode_helper.pxi.in @@ -0,0 +1,217 @@ +{{py: + +# name +cimported_types = [#'complex64', + #'complex128', + 'float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'pymap', + 'str', + 'strbox', + 'uint8', + 'uint16', + 'uint32', + 'uint64'] +}} + +{{for name in cimported_types}} +from pandas._libs.khash cimport ( + kh_destroy_{{name}}, + kh_exist_{{name}}, + kh_get_{{name}}, + kh_init_{{name}}, + kh_put_{{name}}, + kh_resize_{{name}}, + kh_{{name}}_t, +) + +{{endfor}} + +from pandas._libs.khash cimport ( + #khcomplex64_t, + #khcomplex128_t, + khiter_t, +) + +from pandas._libs.hashtable import ( + # NaN checking + #is_nan_khcomplex128_t, + #is_nan_khcomplex64_t, + is_nan_float64_t, + is_nan_float32_t, + is_nan_int64_t, + is_nan_int32_t, + is_nan_int16_t, + is_nan_int8_t, + is_nan_uint64_t, + is_nan_uint32_t, + is_nan_uint16_t, + is_nan_uint8_t, + # Casting + #to_complex64, + #to_complex128, + #to_khcomplex128_t, + #to_khcomplex64_t, +) + +{{py: +# TODO: add complex64 and complex128 (requires comparisons between complex numbers) +# dtype, ttype, c_type, to_c_type, to_dtype +dtypes = [#('complex128', 'complex128', 'khcomplex128_t', + # 'to_khcomplex128_t', 'to_complex128'), + #('complex64', 'complex64', 'khcomplex64_t', + # 'to_khcomplex64_t', 'to_complex64'), + ('float64', 'float64', 'float64_t', '', ''), + ('float32', 'float32', 'float32_t', '', ''), + ('uint64', 'uint64', 'uint64_t', '', ''), + ('uint32', 'uint32', 'uint32_t', '', ''), + ('uint16', 'uint16', 'uint16_t', '', ''), + ('uint8', 'uint8', 'uint8_t', '', ''), + ('object', 'pymap', 'object', '', ''), + ('int64', 'int64', 'int64_t', '', ''), + ('int32', 'int32', 'int32_t', '', ''), + ('int16', 'int16', 'int16_t', '', ''), + ('int8', 'int8', 'int8_t', '', '')] + +}} + +{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}} + +#@cython.wraparound(False) +#@cython.boundscheck(False) +#cdef {{c_type}} calc_mode_{{dtype}}(kh_{{ttype}}_t *table): +# cdef: +# {{c_type}} mode +# {{c_type}} val +# int count, max_count = 0 +# khiter_t k +# +# +# for k in range(table.n_buckets): +# if kh_exist_{{ttype}}(table, k): +# count = table.vals[k] +# {{if dtype != 'object'}} +# val = table.keys[k] +# if count == max_count and val < mode: +# {{else}} +# val = table.keys[k] +# if count == max_count: +# {{endif}} +# mode = val +# elif count > max_count: +# mode = val +# max_count = count +# return mode + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mode_{{dtype}}(ndarray[{{c_type}}, ndim=1] out, + ndarray[{{c_type}}, ndim=1] values, + ndarray[int64_t, ndim=1] labels, + bint dropna = True): + """ + Calculates the mode of each group. + If multimodal returns the smallest mode in each group if numeric. + For all other datatypes, returns a mode. + """ + cdef: + Py_ssize_t i, N = len(values) + int64_t lab, curr_label = -1 + kh_{{ttype}}_t *table + khiter_t k + int count, max_count = 0, ret = 0 + {{c_type}} val, mode + + table = kh_init_{{ttype}}() + {{if dtype != 'object'}} + # Fix NOGIL later with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: # NaN case + continue + if lab != curr_label and curr_label != -1: + for k in range(table.n_buckets): + if kh_exist_{{ttype}}(table, k): + count = table.vals[k] + val = table.keys[k] + if count == max_count and val < mode: + mode = val + elif count > max_count: + mode = val + max_count = count + + #out[curr_label] = calc_mode_{{dtype}}(table) + out[curr_label] = mode + # Reset variables + max_count = 0 + table = kh_init_{{ttype}}() + + val = {{to_c_type}}(values[i]) + + if not is_nan_{{c_type}}(val) or not dropna: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + curr_label = lab + # Calc mode for the last group + for k in range(table.n_buckets): + if kh_exist_{{ttype}}(table, k): + count = table.vals[k] + val = table.keys[k] + if count == max_count and val < mode: + mode = val + elif count > max_count: + mode = val + max_count = count + #out[lab] = calc_mode_{{dtype}}(table) + out[curr_label] = mode + {{else}} + for i in range(N): + lab = labels[i] + if lab < 0: # NaN case + continue + if lab != curr_label and curr_label != -1: + for k in range(table.n_buckets): + if kh_exist_{{ttype}}(table, k): + count = table.vals[k] + val = table.keys[k] + if count == max_count: + mode = val + elif count > max_count: + mode = val + max_count = count + #out[curr_label] = calc_mode_{{dtype}}(table) + out[curr_label] = mode + # Reset variables + table = kh_init_{{ttype}}() + + val = values[i] + if not checknull(val) or not dropna: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + curr_label = lab + for k in range(table.n_buckets): + if kh_exist_{{ttype}}(table, k): + count = table.vals[k] + val = table.keys[k] + if count == max_count: + mode = val + elif count > max_count: + mode = val + max_count = count + out[curr_label] = mode + {{endif}} + kh_destroy_{{ttype}}(table) +{{endfor}} diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 276f162545399..f5ad3e9c2dc3f 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -49,7 +49,8 @@ c_types = ['khcomplex128_t', {{for c_type in c_types}} -cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: +cpdef bint is_nan_{{c_type}}({{c_type}} val) nogil: + # TODO: create missing.pxi.in and move there as cdef? {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }} return val.real != val.real or val.imag != val.imag {{elif c_type in {'float64_t', 'float32_t'} }} diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 35a3768be7e73..4441ee334d6b2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1545,6 +1545,35 @@ def median(self, numeric_only=True): numeric_only=numeric_only, ) + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def mode(self, dropna=True, numeric_only=False): + """ + Compute mode of groups, excluding missing values. If a group has + multiple modes, the smallest mode will be used. + + Parameters + ---------- + dropna : bool, default True + Do not use NaNs in mode calculation + numeric_only: bool, default False + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. + Returns + ------- + Series or DataFrame + Mode of values within each group. + """ + # Note: get_cythonized_result iterates in python, slow for many columns? + return self._get_cythonized_result( + "group_mode", + aggregate=True, + numeric_only=numeric_only, + needs_values=True, + dropna=dropna, + ) + @final @Substitution(name="groupby") @Appender(_common_see_also) @@ -2552,7 +2581,7 @@ def cummax(self, axis=0, **kwargs): def _get_cythonized_result( self, how: str, - cython_dtype: np.dtype, + cython_dtype: np.dtype = None, aggregate: bool = False, numeric_only: bool = True, needs_counts: bool = False, @@ -2633,12 +2662,18 @@ def _get_cythonized_result( labels, _, ngroups = grouper.group_info output: Dict[base.OutputKey, np.ndarray] = {} - base_func = getattr(libgroupby, how) + if cython_dtype is not None: + base_func = getattr(libgroupby, how) error_msg = "" for idx, obj in enumerate(self._iterate_slices()): name = obj.name values = obj._values + if cython_dtype is None: + cython_dtype = values.dtype + # We also need to get the specific function for that dtype + how += f"_{cython_dtype}" + base_func = getattr(libgroupby, how) if numeric_only and not is_numeric_dtype(values): continue diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 8799f6faa775c..b8f52472e0ad6 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -380,3 +380,25 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): result = grouped["col"].aggregate(op_name) assert result.dtype == expected_dtype + + +def test_mode_numeric(): + data = { + "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], + "B": ["A", "B"] * 6, + "C": [2, 4, 3, 1, 2, 3, 4, 5, 2, 7, 9, 9], + } + df = DataFrame(data) + df.drop(columns="B", inplace=True) + # Group by 1 column + result = df.groupby("A").mode() + exp = DataFrame({"C": [1, 2]}, index=Series([0.0, 1.0], name="A")) + tm.assert_frame_equal(result, exp) + # Group by 2 column + df = DataFrame(data) + result = df.groupby(by=["A", "B"]).mode() + exp = DataFrame( + {"C": [3, 1, 2, 7]}, + index=pd.MultiIndex.from_product([[0.0, 1.0], ["A", "B"]], names=["A", "B"]), + ) + tm.assert_frame_equal(result, exp) diff --git a/setup.py b/setup.py index f9c4a1158fee0..b7c82a252d77d 100755 --- a/setup.py +++ b/setup.py @@ -50,6 +50,10 @@ def is_platform_mac(): _pxi_dep_template = { "algos": ["_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in"], + "groupby": [ + "_libs/groupby_mode_helper.pxi.in", + "_libs/khash_for_primitive_helper.pxi.in", + ], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -524,7 +528,14 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "include": klib_include, "depends": _pxi_dep["algos"], }, - "_libs.groupby": {"pyxfile": "_libs/groupby"}, + "_libs.groupby": { + "pyxfile": "_libs/groupby", + "include": klib_include, + "depends": ( + ["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"] + + _pxi_dep["groupby"] + ), + }, "_libs.hashing": {"pyxfile": "_libs/hashing", "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", From f318f24640c68171c7ed9435a72326a45ee96459 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 17 Feb 2021 09:54:54 -0800 Subject: [PATCH 2/5] Fixed annoying mistake --- pandas/_libs/groupby.pyx | 1 - pandas/_libs/groupby_mode_helper.pxi.in | 105 ++++++++---------------- 2 files changed, 33 insertions(+), 73 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 476c40506ba68..9ede0306ef13f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -34,7 +34,6 @@ from pandas._libs.algos import groupsort_indexer, rank_1d, take_2d_axis1_float64 from pandas._libs.missing cimport checknull -include "groupby_mode_helper.pxi" cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max diff --git a/pandas/_libs/groupby_mode_helper.pxi.in b/pandas/_libs/groupby_mode_helper.pxi.in index 54d4f2ca8458f..6ed7b9d2bec15 100644 --- a/pandas/_libs/groupby_mode_helper.pxi.in +++ b/pandas/_libs/groupby_mode_helper.pxi.in @@ -81,31 +81,31 @@ dtypes = [#('complex128', 'complex128', 'khcomplex128_t', {{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}} -#@cython.wraparound(False) -#@cython.boundscheck(False) -#cdef {{c_type}} calc_mode_{{dtype}}(kh_{{ttype}}_t *table): -# cdef: -# {{c_type}} mode -# {{c_type}} val -# int count, max_count = 0 -# khiter_t k -# -# -# for k in range(table.n_buckets): -# if kh_exist_{{ttype}}(table, k): -# count = table.vals[k] -# {{if dtype != 'object'}} -# val = table.keys[k] -# if count == max_count and val < mode: -# {{else}} -# val = table.keys[k] -# if count == max_count: -# {{endif}} -# mode = val -# elif count > max_count: -# mode = val -# max_count = count -# return mode + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef {{c_type}} calc_mode_{{dtype}}(kh_{{ttype}}_t *table): + cdef: + {{c_type}} mode + {{c_type}} val + int count, max_count = 0 + khiter_t k + + for k in range(table.n_buckets): + if kh_exist_{{ttype}}(table, k): + count = table.vals[k] + {{if dtype != 'object'}} + val = table.keys[k] + if count == max_count and val < mode: + {{else}} + val = table.keys[k] + if count == max_count: + {{endif}} + mode = val + elif count > max_count: + mode = val + max_count = count + return mode @cython.wraparound(False) @@ -124,29 +124,19 @@ def group_mode_{{dtype}}(ndarray[{{c_type}}, ndim=1] out, int64_t lab, curr_label = -1 kh_{{ttype}}_t *table khiter_t k - int count, max_count = 0, ret = 0 - {{c_type}} val, mode + int ret = 0 + {{c_type}} mode table = kh_init_{{ttype}}() {{if dtype != 'object'}} - # Fix NOGIL later with nogil: + #TODO: Fix NOGIL later + #with nogil: for i in range(N): lab = labels[i] if lab < 0: # NaN case continue if lab != curr_label and curr_label != -1: - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - count = table.vals[k] - val = table.keys[k] - if count == max_count and val < mode: - mode = val - elif count > max_count: - mode = val - max_count = count - - #out[curr_label] = calc_mode_{{dtype}}(table) - out[curr_label] = mode + out[curr_label] = calc_mode_{{dtype}}(table) # Reset variables max_count = 0 table = kh_init_{{ttype}}() @@ -162,34 +152,14 @@ def group_mode_{{dtype}}(ndarray[{{c_type}}, ndim=1] out, table.vals[k] = 1 curr_label = lab # Calc mode for the last group - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - count = table.vals[k] - val = table.keys[k] - if count == max_count and val < mode: - mode = val - elif count > max_count: - mode = val - max_count = count - #out[lab] = calc_mode_{{dtype}}(table) - out[curr_label] = mode + out[curr_label] = calc_mode_{{dtype}}(table) {{else}} for i in range(N): lab = labels[i] if lab < 0: # NaN case continue if lab != curr_label and curr_label != -1: - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - count = table.vals[k] - val = table.keys[k] - if count == max_count: - mode = val - elif count > max_count: - mode = val - max_count = count - #out[curr_label] = calc_mode_{{dtype}}(table) - out[curr_label] = mode + out[curr_label] = calc_mode_{{dtype}}(table) # Reset variables table = kh_init_{{ttype}}() @@ -202,16 +172,7 @@ def group_mode_{{dtype}}(ndarray[{{c_type}}, ndim=1] out, k = kh_put_{{ttype}}(table, val, &ret) table.vals[k] = 1 curr_label = lab - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - count = table.vals[k] - val = table.keys[k] - if count == max_count: - mode = val - elif count > max_count: - mode = val - max_count = count - out[curr_label] = mode + out[curr_label] = calc_mode_{{dtype}}(table) {{endif}} kh_destroy_{{ttype}}(table) {{endfor}} From 6418db79e0bf784f568128c8d1780b175673b7ac Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 17 Feb 2021 11:10:37 -0800 Subject: [PATCH 3/5] Maybe silence warning? --- pandas/_libs/groupby_mode_helper.pxi.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/groupby_mode_helper.pxi.in b/pandas/_libs/groupby_mode_helper.pxi.in index 6ed7b9d2bec15..de784a9ce7d09 100644 --- a/pandas/_libs/groupby_mode_helper.pxi.in +++ b/pandas/_libs/groupby_mode_helper.pxi.in @@ -86,7 +86,7 @@ dtypes = [#('complex128', 'complex128', 'khcomplex128_t', @cython.boundscheck(False) cdef {{c_type}} calc_mode_{{dtype}}(kh_{{ttype}}_t *table): cdef: - {{c_type}} mode + {{c_type}} mode = 0 # fix annoying uninitialized warning {{c_type}} val int count, max_count = 0 khiter_t k @@ -125,7 +125,6 @@ def group_mode_{{dtype}}(ndarray[{{c_type}}, ndim=1] out, kh_{{ttype}}_t *table khiter_t k int ret = 0 - {{c_type}} mode table = kh_init_{{ttype}}() {{if dtype != 'object'}} From 284e0243b006e59963437cbc545a896242e1c5a7 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 17 Feb 2021 12:02:34 -0800 Subject: [PATCH 4/5] Update allowlist --- pandas/core/groupby/base.py | 1 + pandas/tests/groupby/test_allowlist.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index c169e29b74dbb..f92decce6c6be 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -141,6 +141,7 @@ def _gotitem(self, key, ndim, subset=None): "mad", "max", "mean", + "mode", "median", "min", "ngroup", diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index de8335738791d..3a5e0fb0231d2 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -285,6 +285,7 @@ def test_tab_completion(mframe): "mean", "median", "min", + "mode", "ngroups", "nth", "ohlc", From 4ca6748f893b0ca74379810a69025b85ca4ff758 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 17 Feb 2021 13:20:58 -0800 Subject: [PATCH 5/5] Fix tests --- pandas/tests/groupby/test_categorical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 4049ef46f3006..f518997c7e7db 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -52,6 +52,7 @@ def f(a): "max": np.NaN, "mean": np.NaN, "median": np.NaN, + "mode": np.NaN, "min": np.NaN, "nth": np.NaN, "nunique": 0,