From 567e823e1a6ef3eca7b9fc80edc2c28e41a7a481 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 9 Apr 2016 17:02:04 +0900 Subject: [PATCH] BUG: GroupBy with TimeGrouper sorts unstably --- doc/source/whatsnew/v0.18.1.txt | 2 ++ pandas/core/groupby.py | 3 ++- pandas/tseries/tests/test_resample.py | 30 ++++++++++++++++++++++++--- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 7d79367cef1e2..b3bdf3df1eb20 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -241,3 +241,5 @@ Bug Fixes - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`) - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`) - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`) + +- Bug in ``GroupBy.first()``, ``.last()`` returns incorrect row when ``TimeGrouper`` is used (:issue:`7453`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a0a358717fdc6..a99ab46f3623a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -273,7 +273,8 @@ def _set_grouper(self, obj, sort=False): # possibly sort if (self.sort or sort) and not ax.is_monotonic: - indexer = self.indexer = ax.argsort(kind='quicksort') + # use stable sort to support first, last, nth + indexer = self.indexer = ax.argsort(kind='mergesort') ax = ax.take(indexer) obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 118b06a63dfd8..a9348eb11e13b 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -2365,6 +2365,28 @@ def test_fails_on_no_datetime_index(self): "got an instance of 'PeriodIndex'"): df.groupby(TimeGrouper('D')) + def 
test_aaa_group_order(self): + # GH 12840 + # check TimeGrouper performs stable sorts + n = 20 + data = np.random.randn(n, 4) + df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), + datetime(2013, 1, 3), datetime(2013, 1, 4), + datetime(2013, 1, 5)] * 4 + grouped = df.groupby(TimeGrouper(key='key', freq='D')) + + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), + df[::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), + df[1::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), + df[2::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), + df[3::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), + df[4::5]) + def test_aggregate_normal(self): # check TimeGrouper's aggregation is identical as normal groupby @@ -2402,7 +2424,8 @@ def test_aggregate_normal(self): periods=5, name='key') dt_result = getattr(dt_grouped, func)() assert_series_equal(expected, dt_result) - """ + + # GH 7453 for func in ['first', 'last']: expected = getattr(normal_grouped, func)() expected.index = date_range(start='2013-01-01', freq='D', @@ -2410,6 +2433,9 @@ def test_aggregate_normal(self): dt_result = getattr(dt_grouped, func)() assert_frame_equal(expected, dt_result) + # if TimeGrouper is included, 'nth' doesn't work yet + + """ for func in ['nth']: expected = getattr(normal_grouped, func)(3) expected.index = date_range(start='2013-01-01', @@ -2417,8 +2443,6 @@ def test_aggregate_normal(self): dt_result = getattr(dt_grouped, func)(3) assert_frame_equal(expected, dt_result) """ - # if TimeGrouper is used included, 'first','last' and 'nth' doesn't - # work yet def test_aggregate_with_nat(self): # check TimeGrouper's aggregation is identical as normal groupby