From dbf06767daad964854a774b256b310ed8b3b8a33 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Thu, 8 Feb 2018 17:04:32 -0800
Subject: [PATCH 1/2] Consolidated groupby_helpers; added / cleaned tests

---
 pandas/_libs/groupby.pyx             | 99 ----------------------------
 pandas/_libs/groupby_helper.pxi.in   | 32 +++++----
 pandas/tests/groupby/test_groupby.py | 73 ++++++++++++--------
 3 files changed, 65 insertions(+), 139 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 55de700c9af52..f6c3aa151c6b5 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -25,105 +25,6 @@ cdef double NaN = np.NaN
 cdef double nan = NaN
 
 
-# TODO: aggregate multiple columns in single pass
-# ----------------------------------------------------------------------
-# first, nth, last
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_object(ndarray[object, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[object, ndim=2] values,
-                     ndarray[int64_t] labels,
-                     int64_t rank,
-                     Py_ssize_t min_count=-1):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[int64_t, ndim=2] nobs
-        ndarray[object, ndim=2] resx
-
-    assert min_count == -1, "'min_count' only used in add and prod"
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                if nobs[lab, j] == rank:
-                    resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_object(ndarray[object, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[object, ndim=2] values,
-                      ndarray[int64_t] labels,
-                      Py_ssize_t min_count=-1):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    assert min_count == -1, "'min_count' only used in add and prod"
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
 cdef inline float64_t median_linear(float64_t* a, int n) nogil:
     cdef int i, j, na_count = 0
     cdef float64_t result
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index a751fadaf48cf..025f1b2bc011e 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 # name, c_type, dest_type2, nan_val
 dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'),
           ('float32', 'float32_t', 'float32_t', 'NAN'),
-          ('int64', 'int64_t', 'int64_t', 'iNaT')]
+          ('int64', 'int64_t', 'int64_t', 'iNaT'),
+          ('object', 'object', 'object', 'NAN')]
 
 
 def get_dispatch(dtypes):
@@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
+        {{dest_type2}} val
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    {{if name=='object'}}
+    resx = np.empty((<object> out).shape, dtype=object)
+    {{else}}
     resx = np.empty_like(out)
+    {{endif}}
 
     N, K = (<object> values).shape
 
+    {{if name == "object"}}
+    if True: # make templating happy
+    {{else}}
     with nogil:
+    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
                 if val == val and val != {{nan_val}}:
-                {{endif}}
                     nobs[lab, j] += 1
                     resx[lab, j] = val
 
@@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
@@ -403,7 +407,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
+        {{dest_type2}} val
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    {{if name=='object'}}
+    resx = np.empty((<object> out).shape, dtype=object)
+    {{else}}
     resx = np.empty_like(out)
+    {{endif}}
 
     N, K = (<object> values).shape
 
+    {{if name == "object"}}
+    if True: # make templating happy
+    {{else}}
     with nogil:
+    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
                 if val == val and val != {{nan_val}}:
-                {{endif}}
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 5172efe25d697..f98045dd68a62 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2086,7 +2086,19 @@ def test_median_empty_bins(self):
         expected = df.groupby(bins).agg(lambda x: x.median())
         assert_frame_equal(result, expected)
 
-    def test_groupby_non_arithmetic_agg_types(self):
+    @pytest.mark.parametrize("dtype", [
+        'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
+    @pytest.mark.parametrize("method,data", [
+        ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+        ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+        ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+        ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+        ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+                 'args': [1]}),
+        ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+                   'out_type': 'int64'})
+    ])
+    def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
         # GH9311, GH6620
         df = pd.DataFrame(
             [{'a': 1, 'b': 1},
@@ -2094,39 +2106,44 @@ def test_groupby_non_arithmetic_agg_types(self):
             {'a': 2, 'b': 3},
             {'a': 2, 'b': 4}])
 
-        dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
+        df['b'] = df.b.astype(dtype)
 
-        grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
-                   'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
-                   'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
-                   'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
-                   'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
-                           'args': [1]},
-                   'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
-                             'out_type': 'int64'}}
+        if 'args' not in data:
+            data['args'] = []
 
-        for dtype in dtypes:
-            df_in = df.copy()
-            df_in['b'] = df_in.b.astype(dtype)
-
-            for method, data in compat.iteritems(grp_exp):
-                if 'args' not in data:
-                    data['args'] = []
+        if 'out_type' in data:
+            out_type = data['out_type']
+        else:
+            out_type = dtype
 
-                if 'out_type' in data:
-                    out_type = data['out_type']
-                else:
-                    out_type = dtype
+        exp = data['df']
+        df_out = pd.DataFrame(exp)
 
-                exp = data['df']
-                df_out = pd.DataFrame(exp)
+        df_out['b'] = df_out.b.astype(out_type)
+        df_out.set_index('a', inplace=True)
 
-                df_out['b'] = df_out.b.astype(out_type)
-                df_out.set_index('a', inplace=True)
+        grpd = df.groupby('a')
+        t = getattr(grpd, method)(*data['args'])
+        assert_frame_equal(t, df_out)
 
-                grpd = df_in.groupby('a')
-                t = getattr(grpd, method)(*data['args'])
-                assert_frame_equal(t, df_out)
+    @pytest.mark.parametrize("method,exp,args", [
+        ('first', [('bar', 'quuz'), ('foo', 'baz')], []),
+        ('last', [('bar', 'grault'), ('foo', 'quux')], []),
+        ('nth', [('bar', 'corge'), ('foo', 'qux')], [1]),
+    ])
+    def test_groupby_get_nth_object(self, method, exp, args):
+        df = pd.DataFrame(
+            [{'a': 'foo', 'b': 'baz'},
+             {'a': 'foo', 'b': 'qux'},
+             {'a': 'foo', 'b': 'quux'},
+             {'a': 'bar', 'b': 'quuz'},
+             {'a': 'bar', 'b': 'corge'},
+             {'a': 'bar', 'b': 'grault'}])
+        exp_df = pd.DataFrame(exp, columns=['a', 'b'])
+        exp_df.set_index('a', inplace=True)
+        grpd = df.groupby('a')
+        t = getattr(grpd, method)(*args)
+        assert_frame_equal(t, exp_df)
 
     def test_groupby_non_arithmetic_agg_intlike_precision(self):
         # GH9311, GH6620

From fa63b47595546d4583b0fa7dbfd9e9bff393c76e Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Thu, 8 Feb 2018 19:29:30 -0800
Subject: [PATCH 2/2] Removed unnecessary get_nth_object test

---
 pandas/tests/groupby/test_groupby.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index f98045dd68a62..4a3e29d295e0e 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2126,25 +2126,6 @@ def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
         t = getattr(grpd, method)(*data['args'])
         assert_frame_equal(t, df_out)
 
-    @pytest.mark.parametrize("method,exp,args", [
-        ('first', [('bar', 'quuz'), ('foo', 'baz')], []),
-        ('last', [('bar', 'grault'), ('foo', 'quux')], []),
-        ('nth', [('bar', 'corge'), ('foo', 'qux')], [1]),
-    ])
-    def test_groupby_get_nth_object(self, method, exp, args):
-        df = pd.DataFrame(
-            [{'a': 'foo', 'b': 'baz'},
-             {'a': 'foo', 'b': 'qux'},
-             {'a': 'foo', 'b': 'quux'},
-             {'a': 'bar', 'b': 'quuz'},
-             {'a': 'bar', 'b': 'corge'},
-             {'a': 'bar', 'b': 'grault'}])
-        exp_df = pd.DataFrame(exp, columns=['a', 'b'])
-        exp_df.set_index('a', inplace=True)
-        grpd = df.groupby('a')
-        t = getattr(grpd, method)(*args)
-        assert_frame_equal(t, exp_df)
-
     def test_groupby_non_arithmetic_agg_intlike_precision(self):
         # GH9311, GH6620
         c = 24650000000000000