diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ae3a0847600e5..bc12122434d1f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -24,10 +24,10 @@ class DataError(GroupByError): class SpecificationError(GroupByError): pass -def _groupby_function(name, alias, npfunc): +def _groupby_function(name, alias, npfunc, numeric_only=True): def f(self): try: - return self._cython_agg_general(alias) + return self._cython_agg_general(alias, numeric_only=numeric_only) except Exception: return self.aggregate(lambda x: npfunc(x, axis=self.axis)) @@ -350,8 +350,9 @@ def size(self): prod = _groupby_function('prod', 'prod', np.prod) min = _groupby_function('min', 'min', np.min) max = _groupby_function('max', 'max', np.max) - first = _groupby_function('first', 'first', _first_compat) - last = _groupby_function('last', 'last', _last_compat) + first = _groupby_function('first', 'first', _first_compat, + numeric_only=False) + last = _groupby_function('last', 'last', _last_compat, numeric_only=False) def ohlc(self): """ @@ -370,10 +371,11 @@ def picker(arr): return np.nan return self.agg(picker) - def _cython_agg_general(self, how): + def _cython_agg_general(self, how, numeric_only=True): output = {} for name, obj in self._iterate_slices(): - if not issubclass(obj.dtype.type, (np.number, np.bool_)): + is_numeric = issubclass(obj.dtype.type, (np.number, np.bool_)) + if numeric_only and not is_numeric: continue result, names = self.grouper.aggregate(obj.values, how) @@ -668,6 +670,11 @@ def get_group_levels(self): 'last': lib.group_last } + _cython_object_functions = { + 'first' : lambda a, b, c, d: lib.group_nth_object(a, b, c, d, 1), + 'last' : lib.group_last_object + } + _cython_transforms = { 'std' : np.sqrt } @@ -681,7 +688,13 @@ def get_group_levels(self): _filter_empty_groups = True def aggregate(self, values, how, axis=0): - values = com._ensure_float64(values) + values = com.ensure_float(values) + is_numeric = True + + if not 
issubclass(values.dtype.type, (np.number, np.bool_)): + values = values.astype(object) + is_numeric = False + arity = self._cython_arity.get(how, 1) vdim = values.ndim @@ -698,15 +711,19 @@ def aggregate(self, values, how, axis=0): out_shape = (self.ngroups,) + values.shape[1:] # will be filled in Cython function - result = np.empty(out_shape, dtype=np.float64) + result = np.empty(out_shape, dtype=values.dtype) counts = np.zeros(self.ngroups, dtype=np.int64) - result = self._aggregate(result, counts, values, how) + result = self._aggregate(result, counts, values, how, is_numeric) if self._filter_empty_groups: if result.ndim == 2: - result = lib.row_bool_subset(result, - (counts > 0).view(np.uint8)) + if is_numeric: + result = lib.row_bool_subset(result, + (counts > 0).view(np.uint8)) + else: + result = lib.row_bool_subset_object(result, + (counts > 0).view(np.uint8)) else: result = result[counts > 0] @@ -724,8 +741,11 @@ def aggregate(self, values, how, axis=0): return result, names - def _aggregate(self, result, counts, values, how): - agg_func = self._cython_functions[how] + def _aggregate(self, result, counts, values, how, is_numeric): + fdict = self._cython_functions + if not is_numeric: + fdict = self._cython_object_functions + agg_func = fdict[how] trans_func = self._cython_transforms.get(how, lambda x: x) comp_ids, _, ngroups = self.group_info @@ -913,14 +933,22 @@ def names(self): 'last': lib.group_last_bin } + _cython_object_functions = { + 'first' : lambda a, b, c, d: lib.group_nth_bin_object(a, b, c, d, 1), + 'last' : lib.group_last_bin_object + } + _name_functions = { 'ohlc' : lambda *args: ['open', 'high', 'low', 'close'] } _filter_empty_groups = True - def _aggregate(self, result, counts, values, how): - agg_func = self._cython_functions[how] + def _aggregate(self, result, counts, values, how, is_numeric=True): + fdict = self._cython_functions + if not is_numeric: + fdict = self._cython_object_functions + agg_func = fdict[how] trans_func = 
self._cython_transforms.get(how, lambda x: x) if values.ndim > 3: @@ -1385,8 +1413,8 @@ def _iterate_slices(self): yield val, slicer(val) - def _cython_agg_general(self, how): - new_blocks = self._cython_agg_blocks(how) + def _cython_agg_general(self, how, numeric_only=True): + new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) return self._wrap_agged_blocks(new_blocks) def _wrap_agged_blocks(self, blocks): @@ -1408,18 +1436,20 @@ def _wrap_agged_blocks(self, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how): + def _cython_agg_blocks(self, how, numeric_only=True): data, agg_axis = self._get_data_to_aggregate() new_blocks = [] for block in data.blocks: values = block.values - if not issubclass(values.dtype.type, (np.number, np.bool_)): + is_numeric = issubclass(values.dtype.type, (np.number, np.bool_)) + if numeric_only and not is_numeric: continue - values = com._ensure_float64(values) - result, names = self.grouper.aggregate(values, how, axis=agg_axis) + if is_numeric: + values = com.ensure_float(values) + result, _ = self.grouper.aggregate(values, how, axis=agg_axis) newb = make_block(result, block.items, block.ref_items) new_blocks.append(newb) @@ -2210,5 +2240,3 @@ def complete_dataframe(obj, prev_completions): install_ipython_completers() except Exception: pass - - diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index 552f781e6be0c..739897ea8971a 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -376,6 +376,49 @@ def group_nth(ndarray[float64_t, ndim=2] out, else: out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels, + int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + object val + float64_t count + ndarray[int64_t, ndim=2] nobs + ndarray[object, ndim=2] resx + + nobs = np.zeros((<object> out).shape, 
dtype=np.int64) + resx = np.empty((<object> out).shape, dtype=object) + + N, K = (<object> values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] @cython.boundscheck(False) @cython.wraparound(False) @@ -423,6 +466,54 @@ def group_nth_bin(ndarray[float64_t, ndim=2] out, else: out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[float64_t, ndim=2] nobs + + nobs = np.zeros((<object> out).shape, dtype=np.float64) + resx = np.empty((<object> out).shape, dtype=object) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_last(ndarray[float64_t, ndim=2] out, @@ -464,6 +555,48 @@ def group_last(ndarray[float64_t, ndim=2] out, else: out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 
+ ''' + cdef: + Py_ssize_t i, j, N, K, lab + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros((<object> out).shape, dtype=np.int64) + resx = np.empty((<object> out).shape, dtype=object) + + N, K = (<object> values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) @@ -510,6 +643,53 @@ def group_last_bin(ndarray[float64_t, ndim=2] out, else: out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_bin_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[float64_t, ndim=2] nobs + + nobs = np.zeros((<object> out).shape, dtype=np.float64) + resx = np.empty((<object> out).shape, dtype=object) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + #---------------------------------------------------------------------- # group_min, group_max @@ -1321,6 +1501,26 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, return out +@cython.boundscheck(False) +@cython.wraparound(False) +def row_bool_subset_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, 
cast=True] mask): + cdef: + Py_ssize_t i, j, n, k, pos = 0 + ndarray[object, ndim=2] out + + n, k = (<object> values).shape + assert(n == len(mask)) + + out = np.empty((mask.sum(), k), dtype=object) + + for i in range(n): + if mask[i]: + for j in range(k): + out[pos, j] = values[i, j] + pos += 1 + + return out def group_count(ndarray[int64_t] values, Py_ssize_t size): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index b3f3bd95f9c54..bf46ed4d2ce54 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -127,12 +127,12 @@ def test_first_last_nth(self): # tests for first / last / nth grouped = self.df.groupby('A') first = grouped.first() - expected = self.df.ix[[1, 0], ['C', 'D']] + expected = self.df.ix[[1, 0], ['B', 'C', 'D']] expected.index = ['bar', 'foo'] assert_frame_equal(first, expected) last = grouped.last() - expected = self.df.ix[[5, 7], ['C', 'D']] + expected = self.df.ix[[5, 7], ['B', 'C', 'D']] expected.index = ['bar', 'foo'] assert_frame_equal(last, expected) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index e6694fdbdab3b..53dda63c16b1d 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1080,7 +1080,7 @@ def test_frame_datetime64_handling_groupby(self): (3,np.datetime64('2012-07-04'))], columns = ['a', 'date']) result = df.groupby('a').first() - self.assertEqual(result['date'][3].year, 2012) + self.assertEqual(result['date'][3], np.datetime64('2012-07-03')) def test_series_interpolate_intraday(self): # #1698 @@ -2190,4 +2190,3 @@ def test_hash_equivalent(self): if __name__ == '__main__': nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False) -