From 3e1f2ba10fe4a0547dfbe0a2dc70d0990227757e Mon Sep 17 00:00:00 2001 From: mayukh18 Date: Sat, 8 Apr 2017 02:21:35 +0530 Subject: [PATCH] clean demo of error.only the op sum is ready in this --- pandas/_libs/algos_groupby_helper.pxi.in | 399 ++++++++++++++++------- pandas/core/groupby.py | 52 +-- 2 files changed, 302 insertions(+), 149 deletions(-) diff --git a/pandas/_libs/algos_groupby_helper.pxi.in b/pandas/_libs/algos_groupby_helper.pxi.in index e2c263f49b110..dd226a8c86e58 100644 --- a/pandas/_libs/algos_groupby_helper.pxi.in +++ b/pandas/_libs/algos_groupby_helper.pxi.in @@ -9,26 +9,27 @@ cdef extern from "numpy/npy_math.h": _int64_max = np.iinfo(np.int64).max #---------------------------------------------------------------------- -# group_add, group_prod, group_var, group_mean, group_ohlc +# group_add #---------------------------------------------------------------------- {{py: -# name, c_type, dest_type, dest_dtype -dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'float32_t', 'np.float32')] +# name, c_type, dest_type, dest_dtype, nan_val +dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64', 'NAN'), + ('float32', 'float32_t', 'float32_t', 'np.float32', 'NAN'), + ('int64', 'int64_t', 'int64_t', 'np.int64', 'iNaT')] def get_dispatch(dtypes): - for name, c_type, dest_type, dest_dtype in dtypes: + for name, c_type, dest_type, dest_dtype, nan_val in dtypes: dest_type2 = dest_type dest_type = dest_type.replace('_t', '') - yield name, c_type, dest_type, dest_type2, dest_dtype + yield name, c_type, dest_type, dest_type2, dest_dtype, nan_val }} -{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}} +{{for name, c_type, dest_type, dest_type2, dest_dtype, nan_val in get_dispatch(dtypes)}} @cython.wraparound(False) @@ -36,7 +37,8 @@ def get_dispatch(dtypes): def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint checknull): """ Only aggregates on axis=0 """ @@ -54,25 +56,31 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): lab = labels[i] if lab < 0: continue - counts[lab] += 1 for j in range(K): val = values[i, j] - # not nan - if val == val: + # val = nan + {{if name == 'int64'}} + if val == {{nan_val}}: + sumx[lab, j] = {{nan_val}} + else: + {{else}} + if val != val: + if checknull: + continue + else: + sumx[lab, j] = NAN + else: + {{endif}} nobs[lab, j] += 1 sumx[lab, j] += val - else: - for i in range(N): lab = labels[i] if lab < 0: @@ -81,25 +89,65 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, counts[lab] += 1 val = values[i, 0] - # not nan - if val == val: + # val = nan + {{if name == 'int64'}} + if val == {{nan_val}}: + sumx[lab, 0] = {{nan_val}} + else: + {{else}} + if val != val: + if checknull: + continue + else: + sumx[lab, 0] = NAN + else: + {{endif}} nobs[lab, 0] += 1 sumx[lab, 0] += val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: + {{if name == 'int64'}} + out[i, j] = {{nan_val}} + {{else}} out[i, j] = NAN + {{endif}} else: out[i, j] = sumx[i, j] +{{endfor}} + +#---------------------------------------------------------------------- +# group_prod, group_var, group_mean, group_ohlc +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type, dest_dtype +dtypes = [('float64', 'float64_t', 'float64_t', 
'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32')] + +def get_dispatch(dtypes): + + for name, c_type, dest_type, dest_dtype in dtypes: + + dest_type2 = dest_type + dest_type = dest_type.replace('_t', '') + + yield name, c_type, dest_type, dest_type2, dest_dtype +}} + +{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}} + @cython.wraparound(False) @cython.boundscheck(False) def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint skipna): """ Only aggregates on axis=0 """ @@ -117,38 +165,63 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: + if skipna == False: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] nobs[lab, j] += 1 prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - # not nan - if val == val: + counts[lab] += 1 + val = values[i, 0] nobs[lab, 0] += 1 prodx[lab, 0] *= val + else: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + #not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + #not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: + {{if name == 'int64'}} + out[i, j] = {{nan_val}} + {{else}} out[i, j] = NAN + {{endif}} else: out[i, j] = prodx[i, j] @@ -159,7 +232,8 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint skipna): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, ct, oldmean @@ -176,28 +250,46 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[:, :] = 0.0 with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 + if skipna == False: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - for j in range(K): - val = values[i, j] + counts[lab] += 1 - # not nan - if val == val: + for j in range(K): + val = values[i, j] nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + #not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) for i in range(ncounts): for j in range(K): ct = nobs[i, j] if ct < 2: + {{if name == 'int64'}} + out[i, j] = {{nan_val}} + {{else}} out[i, j] = NAN + {{endif}} else: out[i, j] /= (ct - 1) # add passing bin edges, instead of labels @@ -208,7 +300,8 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, 
ndim=2] out, def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint skipna): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, count @@ -223,39 +316,66 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: + if skipna == False: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] nobs[lab, j] += 1 sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: + counts[lab] += 1 + val = values[i, 0] nobs[lab, 0] += 1 sumx[lab, 0] += val + else: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + #not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + #not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val for i in range(ncounts): for j in range(K): count = nobs[i, j] if nobs[i, j] == 0: + {{if name == 'int64'}} + out[i, j] = {{nan_val}} + {{else}} out[i, j] = NAN + {{endif}} else: - out[i, j] = sumx[i, j] / count + out[i, j] = sumx[i,j] / count @cython.wraparound(False) @@ -263,7 +383,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint skipna): """ Only aggregates on axis=0 """ @@ -455,19 +576,21 @@ def get_dispatch(dtypes): def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint checknull): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] maxx, nobs + ndarray[{{dest_type2}}, ndim=2] maxx, nobs, nancount if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) + nancount = np.zeros_like(out) maxx = np.empty_like(out) maxx.fill(-{{inf_val}}) @@ -484,16 +607,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] - - # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val + if val != val: + nancount[lab, j] += 1 + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val else: for i in range(N): lab = labels[i] @@ -502,13 +620,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, counts[lab] += 1 val = values[i, 0] - - # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} + if val != val: + nancount[lab, 0] += 1 nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val @@ -517,6 +630,8 @@ def 
group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for j in range(K): if nobs[i, j] == 0: out[i, j] = {{nan_val}} + elif checknull == False and nancount[i, j] > 0: + out[i, j] = {{nan_val}} else: out[i, j] = maxx[i, j] @@ -526,19 +641,21 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint skipna): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] minx, nobs + ndarray[{{dest_type2}}, ndim=2] minx, nobs, nancount if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) + nancount = np.zeros_like(out) minx = np.empty_like(out) minx.fill({{inf_val}}) @@ -546,15 +663,62 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if skipna == False: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + if val != val: + nancount[lab, j] += 1 + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + nancount[lab, 0] += 1 + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + else: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} + if val == val and val != {{nan_val}}: + {{endif}} + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + val = values[i, 0] # not nan {{if name == 'int64'}} @@ -562,32 +726,16 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{else}} if val == val and val != {{nan_val}}: {{endif}} - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: out[i, j] = {{nan_val}} + elif skipna == False and nancount[i, j] > 0: + out[i, j] = {{nan_val}} else: out[i, j] = minx[i, j] @@ -684,7 +832,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + bint checknull): """ Only aggregates on axis=0 """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4095a14aa5970..1751c873c2b42 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -105,7 +105,7 @@ 'cummin', 'cummax']) -def _groupby_function(name, alias, npfunc, numeric_only=True, +def _groupby_function(name, alias, npfunc, numeric_only=True, skipna=True, 
_convert=False): _local_template = "Compute %(f)s of group values" @@ -807,7 +807,7 @@ def _cython_transform(self, how, numeric_only=True): return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, alt=None, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True, skipna=True): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -815,7 +815,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True): continue try: - result, names = self.grouper.aggregate(obj.values, how) + result, names = self.grouper.aggregate(obj.values, how, skipna) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -1020,7 +1020,7 @@ def mean(self, *args, **kwargs): For multiple groupings, the result index will be a MultiIndex """ - nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('mean', args, kwargs, ['numeric_only','skipna']) try: return self._cython_agg_general('mean', **kwargs) except GroupByError: @@ -1067,7 +1067,7 @@ def std(self, ddof=1, *args, **kwargs): """ # TODO: implement at Cython level? - nv.validate_groupby_func('std', args, kwargs) + nv.validate_groupby_func('std', args, kwargs,['skipna']) return np.sqrt(self.var(ddof=ddof, **kwargs)) @Substitution(name='groupby') @@ -1083,7 +1083,7 @@ def var(self, ddof=1, *args, **kwargs): ddof : integer, default 1 degrees of freedom """ - nv.validate_groupby_func('var', args, kwargs) + nv.validate_groupby_func('var', args, kwargs, ['skipna']) if ddof == 1: return self._cython_agg_general('var', **kwargs) else: @@ -1093,7 +1093,7 @@ def var(self, ddof=1, *args, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) - def sem(self, ddof=1): + def sem(self, ddof=1, **kwargs): """ Compute standard error of the mean of groups, excluding missing values @@ -1105,7 +1105,7 @@ def sem(self, ddof=1): degrees of freedom """ - return self.std(ddof=ddof) / np.sqrt(self.count()) + return self.std(ddof=ddof, **kwargs) / np.sqrt(self.count()) @Substitution(name='groupby') @Appender(_doc_template) @@ -1117,10 +1117,10 @@ def size(self): result.name = getattr(self, 'name', None) return result - sum = _groupby_function('sum', 'add', np.sum) - prod = _groupby_function('prod', 'prod', np.prod) - min = _groupby_function('min', 'min', np.min, numeric_only=False) - max = _groupby_function('max', 'max', np.max, numeric_only=False) + sum = _groupby_function('sum', 'add', np.sum, skipna=True) + prod = _groupby_function('prod', 'prod', np.prod, skipna=True) + min = _groupby_function('min', 'min', np.min, numeric_only=False, skipna=True) + max = _groupby_function('max', 'max', np.max, numeric_only=False, skipna=True) first = _groupby_function('first', 'first', _first_compat, numeric_only=False, _convert=True) last = _groupby_function('last', 'last', _last_compat, numeric_only=False, @@ -1849,7 +1849,7 @@ def wrapper(*args, **kwargs): (how, dtype_str)) return func, dtype_str - def _cython_operation(self, kind, values, how, axis): + def _cython_operation(self, kind, values, how, axis, skipna): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions @@ -1933,7 +1933,7 @@ def _cython_operation(self, kind, values, how, axis): fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, counts, values, labels, func, is_numeric, + result, counts, values, labels, skipna, func, is_numeric, is_datetimelike) 
elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), @@ -1975,13 +1975,13 @@ def _cython_operation(self, kind, values, how, axis): return result, names - def aggregate(self, values, how, axis=0): - return self._cython_operation('aggregate', values, how, axis) + def aggregate(self, values, how, skipna, axis=0): + return self._cython_operation('aggregate', values, how, axis, skipna) def transform(self, values, how, axis=0): - return self._cython_operation('transform', values, how, axis) + return self._cython_operation('transform', values, how, axis, skipna=True) - def _aggregate(self, result, counts, values, comp_ids, agg_func, + def _aggregate(self, result, counts, values, comp_ids, skipna, agg_func, is_numeric, is_datetimelike): if values.ndim > 3: # punting for now @@ -1991,9 +1991,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, for i, chunk in enumerate(values.transpose(2, 0, 1)): chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids) + agg_func(result[:, :, i], counts, chunk, comp_ids, skipna) else: - agg_func(result, counts, values, comp_ids) + agg_func(result, counts, values, comp_ids, skipna) return result @@ -3187,9 +3187,9 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, alt=None, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True, skipna=True): new_items, new_blocks = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only) + how, alt=alt, numeric_only=numeric_only, skipna=skipna) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -3215,7 +3215,7 @@ def _wrap_agged_blocks(self, items, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how, alt=None, numeric_only=True): + def _cython_agg_blocks(self, how, alt=None, numeric_only=True,skipna=None): # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine @@ -3232,8 +3232,9 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True): locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis) + block.values, how, skipna=skipna, axis=agg_axis) except NotImplementedError: + continue # generally if we have numeric_only=False # and non-applicable functions # try to python agg @@ -3327,6 +3328,9 @@ def aggregate(self, arg, *args, **kwargs): self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) + if result.empty: + for col in result.columns: + result[col] = result[col].astype(self.obj[col]) return result._convert(datetime=True) agg = aggregate
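
Editorial note (not part of the patch): the Cython templates in `algos_groupby_helper.pxi.in` above all follow the same pattern — accumulate per-group totals and per-group observation counts, and either skip missing values (`skipna=True`) or let a missing value mark the whole group as missing (`skipna=False`, tracked via `nancount` in `group_max`/`group_min`). The following is a minimal pure-Python sketch of that semantics for a grouped sum, under the assumption that this is the intended behaviour; `group_add_sketch`, `ngroups`, and the `has_nan` flag are illustrative names, not identifiers from the patch.

```python
import numpy as np

def group_add_sketch(values, labels, ngroups, skipna=True):
    """Grouped sum over 1-D float values, mirroring the kernels' skipna idea:
    skipna=True  -> NaNs are ignored inside each group
    skipna=False -> any NaN in a group makes that group's result NaN
    """
    sums = np.zeros(ngroups, dtype=np.float64)
    nobs = np.zeros(ngroups, dtype=np.int64)
    has_nan = np.zeros(ngroups, dtype=bool)   # plays the role of nancount

    for val, lab in zip(values, labels):
        if lab < 0:               # same convention as the Cython kernels:
            continue              # a negative label means "drop this row"
        if np.isnan(val):
            if skipna:
                continue          # skipna=True: ignore the missing value
            has_nan[lab] = True   # skipna=False: poison the whole group
            continue
        nobs[lab] += 1
        sums[lab] += val

    out = sums.copy()
    out[nobs == 0] = np.nan       # empty group -> NaN, as in the kernels
    if not skipna:
        out[has_nan] = np.nan     # group that saw a NaN -> NaN result
    return out

vals = np.array([1.0, np.nan, 3.0, 4.0])
labs = np.array([0, 0, 1, 1])
group_add_sketch(vals, labs, ngroups=2, skipna=True)   # array([1., 7.])
group_add_sketch(vals, labs, ngroups=2, skipna=False)  # array([nan, 7.])
```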
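
On the Python side, the `groupby.py` changes thread a `skipna` keyword from the public reductions (`sum`, `prod`, `min`, `max`, `mean`, `var`, `sem`) through `_cython_agg_general`, `grouper.aggregate` and `_cython_operation` down to those kernels. The snippet below is a hedged sketch of the end-user behaviour that plumbing appears to target; the `skipna=False` call is hypothetical and left commented out, since the subject line says only the `sum` kernel is wired up in this demo.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"],
                   "val": [1.0, np.nan, 3.0, 4.0]})

# Current pandas behaviour: missing values are always skipped.
df.groupby("key")["val"].sum()                 # a -> 1.0, b -> 7.0

# Behaviour this patch works towards (hypothetical until the keyword lands):
# df.groupby("key")["val"].sum(skipna=False)   # a -> NaN, b -> 7.0
```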