diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 4bdc69be777ba..bbb5060acc35d 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group. named *columns*. Aggregating functions are ones that reduce the dimension of the returned objects, - for example: ``mean, sum, size, count, std, var, describe, first, last, min, max``. This is + for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``. .. _groupby.aggregate.multifunc: @@ -613,7 +613,7 @@ For dataframes with multiple columns, filters should explicitly specify a column a reduced shape of the original (and potentitally eliminating groups), but with the index unchanged. Passing ``as_index=False`` will not affect these transformation methods. - For example: ``head, tail nth``. + For example: ``head, tail``. .. ipython:: python diff --git a/doc/source/release.rst b/doc/source/release.rst index 463cf928660dd..05c6f2276c2de 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -190,7 +190,7 @@ API Changes validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`) - Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the ``data`` argument (:issue:`5357`) -- groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`), +- groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`, :issue:`6732`), as its already the index - ``DataFrame.plot`` and ``Series.plot`` now supports area plot with specifying ``kind='area'`` (:issue:`6656`) - Line plot can be stacked by ``stacked=True``. 
(:issue:`6656`) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index c706312815e37..bf15812e91f8e 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -22,6 +22,8 @@ users upgrade to this version. - :ref:`API Changes ` +- :ref:`Groupby API Changes ` + - :ref:`Performance Improvements ` - :ref:`Prior Deprecations ` @@ -95,57 +97,6 @@ API changes - Add ``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end`` accessors for ``DateTimeIndex`` / ``Timestamp`` which return a boolean array of whether the timestamp(s) are at the start/end of the month/quarter/year defined by the frequency of the ``DateTimeIndex`` / ``Timestamp`` (:issue:`4565`, :issue:`6998`) -- More consistent behaviour for some groupby methods: - - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: - - .. ipython:: python - - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.head(1) # filters DataFrame - - g.apply(lambda x: x.head(1)) # used to simply fall-through - - groupby head and tail respect column selection: - - .. ipython:: python - - g[['B']].head(1) - - groupby ``nth`` now filters by default, with optional dropna argument to ignore - NaN (to replicate the previous behaviour.), See :ref:`the docs `. - - .. ipython:: python - - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.nth(0) # can also use negative ints - - g.nth(0, dropna='any') # similar to old behaviour - - groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`), - as its already the index - - .. ipython:: python - - df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) - g = df.groupby('A') - g.count() - g.describe() - - passing ``as_index`` will leave the grouped column in-place (this is not change in 0.14.0) - - .. 
ipython:: python - - df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) - g = df.groupby('A',as_index=False) - g.count() - g.describe() - -- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping - by a Time and a string field simultaneously. See :ref:`the docs `. (:issue:`3794`) - - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have @@ -247,6 +198,62 @@ API changes from 0.13.1 - Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`) +.. _whatsnew_0140.groupby: + +Groupby API Changes +~~~~~~~~~~~~~~~~~~~ + +More consistent behaviour for some groupby methods: + +- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: + + .. ipython:: python + + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.head(1) # filters DataFrame + + g.apply(lambda x: x.head(1)) # used to simply fall-through + +- groupby head and tail respect column selection: + + .. ipython:: python + + g[['B']].head(1) + +- groupby ``nth`` now filters by default, with optional dropna argument to ignore + NaN (to replicate the previous behaviour). See :ref:`the docs `. + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) # can also use negative ints + + g.nth(0, dropna='any') # similar to old behaviour + +- groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`, :issue:`6732`), + as it's already the index + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) + g = df.groupby('A') + g.count() + g.describe() + +- passing ``as_index`` will leave the grouped column in-place (this is not a change in 0.14.0) + + .. 
ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) + g = df.groupby('A',as_index=False) + g.count() + g.describe() + +- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping + by a Time and a string field simultaneously. See :ref:`the docs `. (:issue:`3794`) + .. _whatsnew_0140.sql: SQL diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b13b2121ac0c4..bce3a993171a7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -99,6 +99,7 @@ class SpecificationError(GroupByError): def _groupby_function(name, alias, npfunc, numeric_only=True, _convert=False): def f(self): + self._set_selection_from_grouper() try: return self._cython_agg_general(alias, numeric_only=numeric_only) except AssertionError as e: @@ -356,6 +357,7 @@ class GroupBy(PandasObject): _apply_whitelist = _common_apply_whitelist _internal_names = ['_cache'] _internal_names_set = set(_internal_names) + _group_selection = None def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, @@ -454,6 +456,8 @@ def _selection_list(self): def _selected_obj(self): if self._selection is None or isinstance(self.obj, Series): + if self._group_selection is not None: + return self.obj[self._group_selection] return self.obj else: return self.obj[self._selection] @@ -461,11 +465,11 @@ def _selected_obj(self): def _set_selection_from_grouper(self): """ we may need create a selection if we have non-level groupers """ grp = self.grouper - if self._selection is None and self.as_index and getattr(grp,'groupings',None) is not None: + if self.as_index and getattr(grp,'groupings',None) is not None: ax = self.obj._info_axis groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ] if len(groupers): - self._selection = (ax-Index(groupers)).tolist() + self._group_selection = (ax-Index(groupers)).tolist() def _local_dir(self): return 
sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) @@ -776,6 +780,7 @@ def nth(self, n, dropna=None): """ + self._set_selection_from_grouper() if not dropna: # good choice m = self.grouper._max_groupsize if n >= m or n < -m: @@ -787,7 +792,21 @@ def nth(self, n, dropna=None): else: rng[- n - 1] = True is_nth = self._cumcount_array(rng, ascending=False) - return self._selected_obj[is_nth] + + result = self._selected_obj[is_nth] + + # the result index + if self.as_index: + ax = self.obj._info_axis + names = self.grouper.names + if all([ n in ax for n in names ]): + result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names) + elif self._group_selection is not None: + result.index = self.obj._get_axis(self.axis)[is_nth] + + result = result.sort_index() + + return result if (isinstance(self._selected_obj, DataFrame) and dropna not in ['any', 'all']): @@ -853,6 +872,7 @@ def cumcount(self, **kwargs): dtype: int64 """ + self._set_selection_from_grouper() ascending = kwargs.pop('ascending', True) index = self._selected_obj.index diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index e5d8b92f7094f..7a8fc8a3832db 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -166,18 +166,27 @@ def test_first_last_nth(self): # tests for first / last / nth grouped = self.df.groupby('A') first = grouped.first() - expected = self.df.ix[[1, 0], ['B', 'C', 'D']] - expected.index = ['bar', 'foo'] - assert_frame_equal(first, expected, check_names=False) + expected = self.df.ix[[1, 0], ['B','C','D']] + expected.index = Index(['bar', 'foo'],name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + nth = grouped.nth(0) + assert_frame_equal(nth, expected) last = grouped.last() - expected = self.df.ix[[5, 7], ['B', 'C', 'D']] - expected.index = ['bar', 'foo'] - assert_frame_equal(last, expected, check_names=False) + expected = self.df.ix[[5, 7], ['B','C','D']] + expected.index = 
Index(['bar', 'foo'],name='A') + assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + assert_frame_equal(nth, expected) nth = grouped.nth(1) - expected = self.df.iloc[[2, 3]] - assert_frame_equal(nth, expected, check_names=False) + expected = self.df.ix[[2, 3],['B','C','D']].copy() + expected.index = Index(['foo', 'bar'],name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) # it works! grouped['B'].first() @@ -189,6 +198,17 @@ def test_first_last_nth(self): self.assert_(com.isnull(grouped['B'].last()['foo'])) self.assert_(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.first() + expected = df.iloc[[1,2]].set_index('A') + assert_frame_equal(result, expected) + + expected = df.iloc[[1,2]].set_index('A') + result = g.nth(0,dropna='any') + assert_frame_equal(result, expected) + def test_first_last_nth_dtypes(self): df = self.df_mixed_floats.copy() @@ -199,17 +219,21 @@ def test_first_last_nth_dtypes(self): grouped = df.groupby('A') first = grouped.first() expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']] - expected.index = ['bar', 'foo'] - assert_frame_equal(first, expected, check_names=False) + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) last = grouped.last() expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']] - expected.index = ['bar', 'foo'] - assert_frame_equal(last, expected, check_names=False) + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(last, expected) nth = grouped.nth(1) - expected = df.iloc[[2, 3]] - assert_frame_equal(nth, expected, check_names=False) + expected = df.ix[[3, 2],['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) # GH 2763, 
first/last shifting dtypes idx = lrange(10) @@ -223,15 +247,15 @@ def test_nth(self): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) g = df.groupby('A') - assert_frame_equal(g.nth(0), df.iloc[[0, 2]]) - assert_frame_equal(g.nth(1), df.iloc[[1]]) - assert_frame_equal(g.nth(2), df.loc[[]]) - assert_frame_equal(g.nth(-1), df.iloc[[1, 2]]) - assert_frame_equal(g.nth(-2), df.iloc[[0]]) - assert_frame_equal(g.nth(-3), df.loc[[]]) + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) + assert_frame_equal(g.nth(2), df.loc[[],['B']]) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) + assert_frame_equal(g.nth(-3), df.loc[[],['B']]) assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) - assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']]) + assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['A', 'B']].set_index('A')) exp = df.set_index('A') assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])