diff --git a/doc/source/release.rst b/doc/source/release.rst index 40e99b879dc29..0e96491fb3aa1 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -179,6 +179,8 @@ API Changes validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`) - Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the ``data`` argument (:issue:`5357`) +- groupby will no longer return the grouped column for non-cython functions (:issue:`5610`), + as it's already the index Deprecations ~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ccbde36b9a09f..f89f56e7a1aa2 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -110,12 +110,29 @@ API changes .. ipython:: python - DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) g = df.groupby('A') g.nth(0) # can also use negative ints g.nth(0, dropna='any') # similar to old behaviour + groupby will no longer return the grouped column for non-cython functions (:issue:`5610`), + as it's already the index + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) + g = df.groupby('A') + g.count() + g.describe() + + passing ``as_index`` will leave the grouped column in-place (this is not a change in 0.14.0) + + df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) + g = df.groupby('A',as_index=False) + g.count() + g.describe() + - Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping by a Time and a string field simultaneously. See :ref:`the docs `. (:issue:`3794`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 23fccc3719278..b255831e51ae0 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -226,11 +226,13 @@ def describe(self): """ # Hack? 
from pandas.core.frame import DataFrame - grouped = DataFrame(self.labels).groupby(0) - counts = grouped.count().values.squeeze() + counts = DataFrame({ + 'labels' : self.labels, + 'values' : self.labels } + ).groupby('labels').count().squeeze().values freqs = counts / float(counts.sum()) - return DataFrame.from_dict({ + return DataFrame({ 'counts': counts, 'freqs': freqs, 'levels': self.levels - }).set_index('levels') + }).set_index('levels') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2c2f133dd52c1..01af7534d458d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -611,11 +611,19 @@ def __neg__(self): arr = operator.inv(values) else: arr = operator.neg(values) - return self._wrap_array(arr, self.axes, copy=False) + return self.__array_wrap__(arr) def __invert__(self): - arr = operator.inv(_values_from_object(self)) - return self._wrap_array(arr, self.axes, copy=False) + try: + arr = operator.inv(_values_from_object(self)) + return self.__array_wrap__(arr) + except: + + # inv fails with 0 len + if not np.prod(self.shape): + return self + + raise def equals(self, other): """ @@ -707,15 +715,11 @@ def __abs__(self): #---------------------------------------------------------------------- # Array Interface - def _wrap_array(self, arr, axes, copy=False): - d = self._construct_axes_dict_from(self, axes, copy=copy) - return self._constructor(arr, **d).__finalize__(self) - def __array__(self, dtype=None): return _values_from_object(self) - def __array_wrap__(self, result): - d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) + def __array_wrap__(self, result, copy=False): + d = self._construct_axes_dict(self._AXIS_ORDERS, copy=copy) return self._constructor(result, **d).__finalize__(self) # ideally we would define this to avoid the getattr checks, but diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 27001bb69cd05..2a36ea65667d6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -445,6 
+445,23 @@ def _selection_list(self): return [self._selection] return self._selection + @cache_readonly + def _selected_obj(self): + + if self._selection is None or isinstance(self.obj, Series): + return self.obj + else: + return self.obj[self._selection] + + def _set_selection_from_grouper(self): + """ we may need create a selection if we have non-level groupers """ + grp = self.grouper + if self._selection is None and self.as_index and getattr(grp,'groupings',None) is not None: + ax = self.obj._info_axis + groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ] + if len(groupers): + self._selection = (ax-Index(groupers)).tolist() + def _local_dir(self): return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) @@ -453,7 +470,6 @@ def __getattr__(self, attr): return object.__getattribute__(self, attr) if attr in self.obj: return self[attr] - if hasattr(self.obj, attr): return self._make_wrapper(attr) @@ -472,6 +488,10 @@ def _make_wrapper(self, name): type(self).__name__)) raise AttributeError(msg) + # need to setup the selection + # as are not passed directly but in the grouper + self._set_selection_from_grouper() + f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) @@ -503,7 +523,19 @@ def curried(x): try: return self.apply(curried_with_axis) except Exception: - return self.apply(curried) + try: + return self.apply(curried) + except Exception: + + # related to : GH3688 + # try item-by-item + # this can be called recursively, so need to raise ValueError if + # we don't have this method to indicated to aggregate to + # mark this column as an error + try: + return self._aggregate_item_by_item(name, *args, **kwargs) + except (AttributeError): + raise ValueError return wrapper @@ -624,6 +656,7 @@ def mean(self): except GroupByError: raise except Exception: # pragma: no cover + self._set_selection_from_grouper() f = lambda x: 
x.mean(axis=self.axis) return self._python_agg_general(f) @@ -639,6 +672,7 @@ def median(self): raise except Exception: # pragma: no cover + self._set_selection_from_grouper() def f(x): if isinstance(x, np.ndarray): x = Series(x) @@ -655,6 +689,7 @@ def std(self, ddof=1): if ddof == 1: return self._cython_agg_general('std') else: + self._set_selection_from_grouper() f = lambda x: x.std(ddof=ddof) return self._python_agg_general(f) @@ -667,15 +702,26 @@ def var(self, ddof=1): if ddof == 1: return self._cython_agg_general('var') else: + self._set_selection_from_grouper() f = lambda x: x.var(ddof=ddof) return self._python_agg_general(f) def size(self): """ Compute group sizes + """ return self.grouper.size() + def count(self, axis=0): + """ + Number of non-null items in each group. + axis : axis number, default 0 + the grouping axis + """ + self._set_selection_from_grouper() + return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64') + sum = _groupby_function('sum', 'add', np.sum) prod = _groupby_function('prod', 'prod', np.prod) min = _groupby_function('min', 'min', np.min, numeric_only=False) @@ -685,14 +731,14 @@ def size(self): last = _groupby_function('last', 'last', _last_compat, numeric_only=False, _convert=True) + def ohlc(self): """ Compute sum of values, excluding missing values - For multiple groupings, the result index will be a MultiIndex - """ - return self._cython_agg_general('ohlc') + return self._apply_to_column_groupbys( + lambda x: x._cython_agg_general('ohlc')) def nth(self, n, dropna=None): """ @@ -888,13 +934,6 @@ def _cumcount_array(self, arr=None, **kwargs): cumcounts[v] = arr[len(v)-1::-1] return cumcounts - @cache_readonly - def _selected_obj(self): - if self._selection is None or isinstance(self.obj, Series): - return self.obj - else: - return self.obj[self._selection] - def _index_with_as_index(self, b): """ Take boolean mask of index to be returned from apply, if as_index=True @@ -990,12 +1029,23 @@ def 
_concat_objects(self, keys, values, not_indexed_same=False): result = result.reindex(ax) else: result = result.reindex_axis(ax, axis=self.axis) - elif self.group_keys and self.as_index: - group_keys = keys - group_levels = self.grouper.levels - group_names = self.grouper.names - result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names) + + elif self.group_keys: + + if self.as_index: + + # possible MI return case + group_keys = keys + group_levels = self.grouper.levels + group_names = self.grouper.names + result = concat(values, axis=self.axis, keys=group_keys, + levels=group_levels, names=group_names) + else: + + # GH5610, returns a MI, with the first level being a + # range index + keys = list(range(len(values))) + result = concat(values, axis=self.axis, keys=keys) else: result = concat(values, axis=self.axis) @@ -2187,6 +2237,9 @@ def true_and_notnull(x, *args, **kwargs): filtered = self._apply_filter(indices, dropna) return filtered + def _apply_to_column_groupbys(self, func): + """ return a pass thru """ + return func(self) class NDFrameGroupBy(GroupBy): @@ -2486,6 +2539,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): elif hasattr(self.grouper, 'groupings'): if len(self.grouper.groupings) > 1: key_index = MultiIndex.from_tuples(keys, names=key_names) + else: ping = self.grouper.groupings[0] if len(keys) == ping.ngroups: @@ -2498,8 +2552,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # reorder the values values = [values[i] for i in indexer] else: + key_index = Index(keys, name=key_names[0]) + # don't use the key indexer + if not self.as_index: + key_index = None + # make Nones an empty object if com._count_not_none(*values) != len(values): v = next(v for v in values if v is not None) @@ -2569,7 +2628,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # normally use vstack as its faster than concat # and if we have mi-columns - if not 
_np_version_under1p7 or isinstance(v.index,MultiIndex): + if not _np_version_under1p7 or isinstance(v.index,MultiIndex) or key_index is None: stacked_values = np.vstack([np.asarray(x) for x in values]) result = DataFrame(stacked_values,index=key_index,columns=index) else: @@ -2889,16 +2948,6 @@ def _apply_to_column_groupbys(self, func): in self._iterate_column_groupbys()), keys=self._selected_obj.columns, axis=1) - def ohlc(self): - """ - Compute sum of values, excluding missing values - - For multiple groupings, the result index will be a MultiIndex - """ - return self._apply_to_column_groupbys( - lambda x: x._cython_agg_general('ohlc')) - - from pandas.tools.plotting import boxplot_frame_groupby DataFrameGroupBy.boxplot = boxplot_frame_groupby diff --git a/pandas/core/series.py b/pandas/core/series.py index c94d7dc9acefd..9c642280169f0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -370,12 +370,12 @@ def __array__(self, result=None): """ the array interface, return my values """ return self.values - def __array_wrap__(self, result): + def __array_wrap__(self, result, copy=False): """ Gets called prior to a ufunc (and after) """ return self._constructor(result, index=self.index, - copy=False).__finalize__(self) + copy=copy).__finalize__(self) def __contains__(self, key): return key in self.index @@ -959,19 +959,6 @@ def iteritems(self): if compat.PY3: # pragma: no cover items = iteritems - # inversion - def __neg__(self): - values = self.values - if values.dtype == np.bool_: - arr = operator.inv(values) - else: - arr = operator.neg(values) - return self._constructor(arr, self.index).__finalize__(self) - - def __invert__(self): - arr = operator.inv(self.values) - return self._constructor(arr, self.index).__finalize__(self) - #---------------------------------------------------------------------- # unbox reductions diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index fde9156017c4e..fcc4eb83b0af9 100644 --- 
a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1378,7 +1378,8 @@ def test_groupby_as_index_apply(self): res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering - exp_not_as_apply = Index([0, 2, 1, 4]) + # changed in GH5610 as the as_index=False returns a MI here + exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) assert_index_equal(res_as_apply, exp_as_apply) @@ -1970,6 +1971,64 @@ def test_size(self): for key, group in grouped: self.assertEquals(result[key], len(group)) + def test_count(self): + + # GH5610 + # count counts non-nulls + df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], columns=['A', 'B', 'C']) + + count_as = df.groupby('A').count() + count_not_as = df.groupby('A', as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], index=[1,3]) + expected.index.name='A' + assert_frame_equal(count_not_as, expected.reset_index()) + assert_frame_equal(count_as, expected) + + count_B = df.groupby('A')['B'].count() + assert_series_equal(count_B, expected['B']) + + def test_non_cython_api(self): + + # GH5610 + # non-cython calls should not include the grouper + + df = DataFrame([[1, 2, 'foo'], [1, nan, 'bar',], [3, nan, 'baz']], columns=['A', 'B','C']) + g = df.groupby('A') + gni = df.groupby('A',as_index=False) + + # mad + expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3]) + expected.index.name = 'A' + result = g.mad() + assert_frame_equal(result,expected) + + expected = DataFrame([[0.,0.],[0,nan]],columns=['A','B'],index=[0,1]) + result = gni.mad() + assert_frame_equal(result,expected) + + # describe + expected = DataFrame(dict(B = concat([df.loc[[0,1],'B'].describe(),df.loc[[2],'B'].describe()],keys=[1,3]))) + expected.index.names = ['A',None] + result = g.describe() + assert_frame_equal(result,expected) + + expected = 
concat([df.loc[[0,1],['A','B']].describe(),df.loc[[2],['A','B']].describe()],keys=[0,1]) + result = gni.describe() + assert_frame_equal(result,expected) + + # any + expected = DataFrame([[True, True],[False, True]],columns=['B','C'],index=[1,3]) + expected.index.name = 'A' + result = g.any() + assert_frame_equal(result,expected) + + # idxmax + expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3]) + expected.index.name = 'A' + result = g.idxmax() + assert_frame_equal(result,expected) + def test_grouping_ndarray(self): grouped = self.df.groupby(self.df['A'].values) @@ -2925,7 +2984,7 @@ def test_groupby_with_timegrouper(self): DT.datetime(2013,12,2,12,0), DT.datetime(2013,9,2,14,0), ]}) - + # GH 6908 change target column's order df_reordered = df_original.sort(columns='Quantity') @@ -3937,8 +3996,14 @@ def test_frame_groupby_plot_boxplot(self): self.assertEqual(len(res), 2) tm.close() + # now works with GH 5610 as gender is excluded + res = df.groupby('gender').hist() + tm.close() + + df2 = df.copy() + df2['gender2'] = df['gender'] with tm.assertRaisesRegexp(TypeError, '.*str.+float'): - gb.hist() + df2.groupby('gender').hist() @slow def test_frame_groupby_hist(self): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 5f975105cd80e..7fe8ab8ca642e 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1126,9 +1126,9 @@ def test_evenly_divisible_with_no_extra_bins(self): expected = DataFrame( [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14, 'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4, - index=index).unstack().swaplevel(1,0).sortlevel() + index=index) result = df.resample('7D', how='count') - assert_series_equal(result,expected) + assert_frame_equal(result,expected) expected = DataFrame( [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,