diff --git a/doc/source/release.rst b/doc/source/release.rst
index a6aa842940bc0..2151407b8c2fd 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -313,6 +313,8 @@ Improvements to existing features
   in item handling (:issue:`6745`, :issue:`6988`).
 - Improve performance in certain reindexing operations by optimizing ``take_2d`` (:issue:`6749`)
 - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`)
+- ``GroupBy.count()`` is now implemented in Cython and is much faster for large
+  numbers of groups (:issue:`7016`).
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index b5df39df3b617..f9f9e5d5e4ad3 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -568,6 +568,8 @@ Performance
 - Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`)
 - Improved performance of compatible pickles (:issue:`6899`)
 - Improve performance in certain reindexing operations by optimizing ``take_2d`` (:issue:`6749`)
+- ``GroupBy.count()`` is now implemented in Cython and is much faster for large
+  numbers of groups (:issue:`7016`).
 
 Experimental
 ~~~~~~~~~~~~
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index ce64ed754180d..400f7e06df784 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -5,7 +5,7 @@
 import collections
 
 from pandas.compat import(
-    zip, builtins, range, long, lrange, lzip,
+    zip, builtins, range, long, lzip,
     OrderedDict, callable
 )
 from pandas import compat
@@ -713,15 +713,6 @@ def size(self):
         """
         return self.grouper.size()
 
-    def count(self, axis=0):
-        """
-        Number of non-null items in each group.
-        axis : axis number, default 0
-            the grouping axis
-        """
-        self._set_selection_from_grouper()
-        return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')
-
     sum = _groupby_function('sum', 'add', np.sum)
     prod = _groupby_function('prod', 'prod', np.prod)
     min = _groupby_function('min', 'min', np.min, numeric_only=False)
@@ -731,6 +722,17 @@ def count(self, axis=0):
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
 
+    _count = _groupby_function('_count', 'count',
+                               lambda x, axis=0: notnull(x).sum(axis=axis),
+                               numeric_only=False)
+
+    def count(self, axis=0):
+        """
+        Number of non-null items in each group.
+        axis : axis number, default 0
+            the grouping axis
+        """
+        return self._count().astype('int64')
 
     def ohlc(self):
         """
@@ -1318,10 +1320,11 @@ def get_group_levels(self):
             'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
         },
         'last': 'group_last',
+        'count': 'group_count',
     }
 
     _cython_transforms = {
-        'std': np.sqrt
+        'std': np.sqrt,
     }
 
     _cython_arity = {
@@ -1390,14 +1393,18 @@ def aggregate(self, values, how, axis=0):
             values = com.ensure_float(values)
             is_numeric = True
         else:
-            if issubclass(values.dtype.type, np.datetime64):
-                raise Exception('Cython not able to handle this case')
-
-            values = values.astype(object)
-            is_numeric = False
+            # datetimelike (datetime64/timedelta64) values are handled as
+            # numeric by viewing the data as int64; NaT becomes iNaT there
+            is_numeric = issubclass(values.dtype.type, (np.datetime64,
+                                                        np.timedelta64))
+            if is_numeric:
+                values = values.view('int64')
+            else:
+                values = values.astype(object)
 
         # will be filled in Cython function
-        result = np.empty(out_shape, dtype=values.dtype)
+        result = np.empty(out_shape,
+                          dtype=np.dtype('f%d' % values.dtype.itemsize))
         result.fill(np.nan)
 
         counts = np.zeros(self.ngroups, dtype=np.int64)
@@ -1405,10 +1412,10 @@
 
         if self._filter_empty_groups:
             if result.ndim == 2:
-                if is_numeric:
+                try:
                     result = lib.row_bool_subset(
                         result, (counts > 0).view(np.uint8))
-                else:
+                except ValueError:
                     result = lib.row_bool_subset_object(
                         result, (counts > 0).view(np.uint8))
             else:
@@ -1651,6 +1658,7 @@ def names(self):
             'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
         },
         'last': 'group_last_bin',
+        'count': 'group_count_bin',
     }
 
     _name_functions = {
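
A side note on the datetimelike branch in ``aggregate()`` above: it relies on
``datetime64``/``timedelta64`` data reinterpreting cleanly as ``int64``, with
``NaT`` mapping to the minimum ``int64`` value, which is the ``iNaT`` sentinel
the count kernels below test against. A standalone NumPy sketch (illustrative
only, not part of the patch):

    import numpy as np

    dates = np.array(['2014-05-05', 'NaT'], dtype='datetime64[ns]')
    ints = dates.view('int64')  # reinterpret the buffer, no copy
    print(ints[1] == np.iinfo(np.int64).min)  # True: NaT is the iNaT sentinel
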
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
index 6d45a82fb7175..53754a899adf8 100644
--- a/pandas/src/generate_code.py
+++ b/pandas/src/generate_code.py
@@ -3,7 +3,6 @@
 # don't introduce a pandas/pandas.compat import
 # or we get a bootstrapping problem
 from StringIO import StringIO
-import os
 
 header = """
 cimport numpy as np
@@ -34,7 +33,9 @@ ctypedef unsigned char UChar
 
 cimport util
-from util cimport is_array, _checknull, _checknan
+from util cimport is_array, _checknull, _checknan, get_nat
+
+cdef int64_t iNaT = get_nat()
 
 # import datetime C API
 PyDateTime_IMPORT
 
@@ -1150,6 +1151,79 @@ def group_var_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
                                (ct * ct - ct))
 """
 
+group_count_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[%(c_type)s, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        %(c_type)s val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+"""
+
+group_count_bin_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[%(c_type)s, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        %(c_type)s val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+"""
 
 # add passing bin edges, instead of labels
 
@@ -2145,7 +2219,8 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
 #-------------------------------------------------------------------------
 # Generators
 
-def generate_put_template(template, use_ints = True, use_floats = True):
+def generate_put_template(template, use_ints=True, use_floats=True,
+                          use_objects=False):
     floats_list = [
         ('float64', 'float64_t', 'float64_t', 'np.float64'),
         ('float32', 'float32_t', 'float32_t', 'np.float32'),
@@ -2156,11 +2231,14 @@ def generate_put_template(template, use_ints = True, use_floats = True):
         ('int32', 'int32_t', 'float64_t', 'np.float64'),
         ('int64', 'int64_t', 'float64_t', 'np.float64'),
     ]
+    object_list = [('object', 'object', 'float64_t', 'np.float64')]
     function_list = []
     if use_floats:
         function_list.extend(floats_list)
     if use_ints:
         function_list.extend(ints_list)
+    if use_objects:
+        function_list.extend(object_list)
 
     output = StringIO()
     for name, c_type, dest_type, dest_dtype in function_list:
@@ -2251,6 +2329,8 @@ def generate_from_template(template, exclude=None):
                 group_max_bin_template,
                 group_ohlc_template]
 
+groupby_count = [group_count_template, group_count_bin_template]
+
 templates_1d = [map_indices_template,
                 pad_template,
                 backfill_template,
@@ -2272,6 +2352,7 @@ def generate_from_template(template, exclude=None):
                      take_2d_axis1_template,
                      take_2d_multi_template]
 
+
 def generate_take_cython_file(path='generated.pyx'):
     with open(path, 'w') as f:
         print(header, file=f)
@@ -2288,7 +2369,10 @@ def generate_take_cython_file(path='generated.pyx'):
             print(generate_put_template(template), file=f)
 
         for template in groupbys:
-            print(generate_put_template(template, use_ints = False), file=f)
+            print(generate_put_template(template, use_ints=False), file=f)
+
+        for template in groupby_count:
+            print(generate_put_template(template, use_objects=True), file=f)
 
         # for template in templates_1d_datetime:
         #     print >> f, generate_from_template_datetime(template)
@@ -2299,5 +2383,6 @@ def generate_take_cython_file(path='generated.pyx'):
         for template in nobool_1d_templates:
             print(generate_from_template(template, exclude=['bool']), file=f)
 
+
 if __name__ == '__main__':
     generate_take_cython_file()
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
index 68bda2957fb55..26c6f3daf0e0a 100644
--- a/pandas/src/generated.pyx
+++ b/pandas/src/generated.pyx
@@ -27,7 +27,9 @@ from khash cimport *
 ctypedef unsigned char UChar
 
 cimport util
-from util cimport is_array, _checknull, _checknan
+from util cimport is_array, _checknull, _checknan, get_nat
+
+cdef int64_t iNaT = get_nat()
 
 # import datetime C API
 PyDateTime_IMPORT
@@ -6621,6 +6623,498 @@ def group_ohlc_float32(ndarray[float32_t, ndim=2] out,
             out[b, 2] = vlow
             out[b, 3] = vclose
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_float64(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[float64_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        float64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_float32(ndarray[float32_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[float32_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        float32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int8(ndarray[float32_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int8_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int8_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int16(ndarray[float32_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int16_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int16_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int32(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int32_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_int64(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[int64_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        int64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_object(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[object, ndim=2] values,
+                         ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, lab
+        Py_ssize_t N = values.shape[0], K = values.shape[1]
+        object val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    if len(values) != len(labels):
+        raise AssertionError("len(values) != len(labels)")
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[lab, j] += val == val and val != iNaT
+
+    for i in range(len(counts)):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_float64(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[float64_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        float64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_float32(ndarray[float32_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[float32_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        float32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int8(ndarray[float32_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int8_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int8_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int16(ndarray[float32_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int16_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int16_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int32(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int32_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int32_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_int64(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[int64_t, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        int64_t val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_count_bin_object(ndarray[float64_t, ndim=2] out,
+                             ndarray[int64_t] counts,
+                             ndarray[object, ndim=2] values,
+                             ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, ngroups
+        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
+        object val
+        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
+                                                 dtype=np.int64)
+
+    ngroups = len(bins) + (bins[len(bins) - 1] != N)
+
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            nobs[b, j] += val == val and val != iNaT
+
+    for i in range(ngroups):
+        for j in range(K):
+            out[i, j] = nobs[i, j]
+
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def left_join_indexer_unique_float64(ndarray[float64_t] left,
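
A note on the ``group_count_bin_*`` variants above: they aggregate over
ordinal bin edges rather than a per-row label array, and
``ngroups = len(bins) + (bins[len(bins) - 1] != N)`` accounts for one trailing
group when the last edge stops short of the end of the data. A minimal
pure-NumPy sketch of the same walk (illustrative only; ``count_by_bins`` is an
invented name, not part of the patch):

    import numpy as np

    def count_by_bins(values, bins):
        # values: 1-d float array; bins: increasing right edges into values
        N = len(values)
        ngroups = len(bins) + (bins[-1] != N)  # extra group past the last edge
        nobs = np.zeros(ngroups, dtype=np.int64)
        b = 0
        for i in range(N):
            while b < ngroups - 1 and i >= bins[b]:
                b += 1
            nobs[b] += values[i] == values[i]  # False only for NaN
        return nobs

    print(count_by_bins(np.array([1.0, np.nan, 2.0, 3.0]), np.array([2, 4])))
    # [1 2]
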
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 1b70ae0309b10..eb3c28b672fd4 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -1451,7 +1451,6 @@ def test_groupby_head_tail(self):
         assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']])
         assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]])
 
-
     def test_groupby_multiple_key(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,
@@ -1629,6 +1628,21 @@ def test_cython_agg_nothing_to_agg(self):
                            'b': ['foo', 'bar'] * 25})
         self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
 
+    def test_cython_agg_nothing_to_agg_with_dates(self):
+        frame = DataFrame({'a': np.random.randint(0, 5, 50),
+                           'b': ['foo', 'bar'] * 25,
+                           'dates': pd.date_range('now', periods=50,
+                                                  freq='T')})
+        with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"):
+            frame.groupby('b').dates.mean()
+
+    def test_groupby_timedelta_cython_count(self):
+        df = DataFrame({'g': list('ab' * 2),
+                        'delt': np.arange(4).astype('timedelta64[ns]')})
+        expected = Series([2, 2], index=['a', 'b'], name='delt')
+        result = df.groupby('g').delt.count()
+        tm.assert_series_equal(expected, result)
+
     def test_cython_agg_frame_columns(self):
         # #2113
         df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
@@ -1992,7 +2006,8 @@ def test_count(self):
 
         # GH5610
         # count counts non-nulls
-        df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], columns=['A', 'B', 'C'])
+        df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]],
+                          columns=['A', 'B', 'C'])
 
         count_as = df.groupby('A').count()
         count_not_as = df.groupby('A', as_index=False).count()
@@ -2005,6 +2020,19 @@ def test_count(self):
         count_B = df.groupby('A')['B'].count()
         assert_series_equal(count_B, expected['B'])
 
+    def test_count_object(self):
+        df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3,
+                           'c': [2] * 3 + [3] * 3})
+        result = df.groupby('c').a.count()
+        expected = pd.Series([3, 3], index=[2, 3], name='a')
+        tm.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
+                           'c': [2] * 3 + [3] * 3})
+        result = df.groupby('c').a.count()
+        expected = pd.Series([1, 3], index=[2, 3], name='a')
+        tm.assert_series_equal(result, expected)
+
     def test_non_cython_api(self):
 
         # GH5610
@@ -2354,7 +2382,6 @@ def test_groupby_aggregation_mixed_dtype(self):
 
         result = g[['v1','v2']].mean()
         assert_frame_equal(result,expected)
 
-
     def test_groupby_dtype_inference_empty(self):
         # GH 6733
         df = DataFrame({'x': [], 'range': np.arange(0,dtype='int64')})
@@ -3325,7 +3352,6 @@ def test_cumcount_groupby_not_col(self):
         assert_series_equal(expected, g.cumcount())
         assert_series_equal(expected, sg.cumcount())
 
-
     def test_filter_series(self):
         import pandas as pd
         s = pd.Series([1, 3, 20, 5, 22, 24, 7])
@@ -4168,6 +4194,14 @@ def test_nargsort(self):
         expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
         assert_equal(result, expected)
 
+    def test_datetime_count(self):
+        df = DataFrame({'a': [1,2,3] * 2,
+                        'dates': pd.date_range('now', periods=6, freq='T')})
+        result = df.groupby('a').dates.count()
+        expected = Series([2, 2, 2], index=Index([1, 2, 3], name='a'),
+                          name='dates')
+        tm.assert_series_equal(result, expected)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 6d99d38049e5a..df9c465c33853 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -462,6 +462,12 @@ class NaTType(_NaT):
     def __hash__(self):
         return iNaT
 
+    def __int__(self):
+        return NPY_NAT
+
+    def __long__(self):
+        return NPY_NAT
+
     def weekday(self):
         return -1
 
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 01644153b28e1..638862ffd1367 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -119,6 +119,36 @@ def f():
                            setup, start_date=datetime(2011, 10, 1))
 
 #----------------------------------------------------------------------
+# count() speed
+
+setup = common_setup + """
+n = 10000
+offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
+
+dates = np.datetime64('now') + offsets
+dates[np.random.rand(n) > 0.5] = np.datetime64('nat')
+
+offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat')
+
+value2 = np.random.randn(n)
+value2[np.random.rand(n) > 0.5] = np.nan
+
+obj = pd.util.testing.choice(['a', 'b'], size=n).astype(object)
+obj[np.random.rand(n) > 0.5] = np.nan
+
+df = DataFrame({'key1': np.random.randint(0, 500, size=n),
+                'key2': np.random.randint(0, 100, size=n),
+                'dates': dates,
+                'value2' : value2,
+                'value3' : np.random.randn(n),
+                'obj': obj,
+                'offsets': offsets})
+"""
+
+groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()",
+                                setup, name='groupby_multi_count',
+                                start_date=datetime(2014, 5, 5))
+#----------------------------------------------------------------------
 # Series.value_counts
 
 setup = common_setup + """
@@ -151,11 +181,11 @@ def f():
 ind2 = np.random.randint(0, 2, size=100000)
 
 df = DataFrame({'key1': fac1.take(ind1),
-                'key2': fac2.take(ind2),
-                'key3': fac2.take(ind2),
-                'value1' : np.random.randn(100000),
-                'value2' : np.random.randn(100000),
-                'value3' : np.random.randn(100000)})
+'key2': fac2.take(ind2),
+'key3': fac2.take(ind2),
+'value1' : np.random.randn(100000),
+'value2' : np.random.randn(100000),
+'value3' : np.random.randn(100000)})
 """
 
 stmt = "df.pivot_table(rows='key1', cols=['key2', 'key3'])"
@@ -192,13 +222,13 @@ def f():
                            start_date=datetime(2012, 5, 1))
 
 groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
-                                  start_date=datetime(2013, 1, 1))
+                           start_date=datetime(2013, 1, 1))
 
 groupby_last = Benchmark('data.groupby(labels).last()', setup,
                          start_date=datetime(2012, 5, 1))
 
 groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
-                                  start_date=datetime(2013, 1, 1))
+                           start_date=datetime(2013, 1, 1))
 
 
 #----------------------------------------------------------------------
@@ -256,9 +286,9 @@ def f():
 labels = np.random.randint(0, 2000, size=N)
 labels2 = np.random.randint(0, 3, size=N)
 df = DataFrame({'key': labels,
-                'key2': labels2,
-                'value1': randn(N),
-                'value2': ['foo', 'bar', 'baz', 'qux'] * (N / 4)})
+'key2': labels2,
+'value1': randn(N),
+'value2': ['foo', 'bar', 'baz', 'qux'] * (N / 4)})
 def f(g):
     return 1
 """
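
For reference, the user-visible behaviour exercised by the new tests (a sketch
assuming a build with this patch; the frame contents are invented for
illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'key': ['a', 'a', 'b', 'b'],
        'x': [1.0, np.nan, 2.0, 3.0],
        'when': pd.to_datetime(['2014-01-01', None, '2014-01-02', None]),
    })
    # count() tallies non-null values per group and column; NaN and NaT are
    # both excluded, and the result dtype is int64.
    print(df.groupby('key').count())
    #      x  when
    # key
    # a    1     1
    # b    2     1
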