From 9c1b4ce0ab47547e8d5e465f4dcab2fe255e9d54 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Wed, 27 Feb 2019 19:31:21 +0100 Subject: [PATCH 01/13] Avoid casting in specific situations --- pandas/core/groupby/generic.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 683c21f7bd47a..84bf561f56b83 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -254,8 +254,11 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): data = obj[item] colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) - result[item] = self._try_cast( - colg.aggregate(func, *args, **kwargs), data) + + result[item] = colg.aggregate(func, *args, **kwargs) + if func != "idxmin" and func != "idxmax": + result[item] = self._try_cast(result[item], data) + except ValueError: cannot_agg.append(item) continue From b947efa708f0a03d9c8752b48fcf29266c2964e6 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Sat, 2 Mar 2019 22:36:21 +0100 Subject: [PATCH 02/13] Unit test --- pandas/tests/groupby/test_groupby.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6a11f0ae9b44a..5696b625a33f1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1714,3 +1714,35 @@ def test_groupby_multiindex_nat(): result = ser.groupby(level=1).mean() expected = pd.Series([3., 2.5], index=["a", "b"]) assert_series_equal(result, expected) + + +def test_idxmin_idxmax_returns_int_types(): + df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], + 'c_int': [1, 2, 3, 4], + 'c_float': [4.02, 3.03, 2.04, 1.05], + 'c_date': ['2019', '2018', '2016', '2017']}) + df['c_date'] = pd.to_datetime(df['c_date']) + + idxmins_computed = df.groupby('name').idxmin() + idxmaxs_computed = df.groupby('name').idxmax() + + for col in idxmaxs_computed: + assert idxmins_computed[col].dtype == np.int64 + assert idxmaxs_computed[col].dtype == np.int64 + + idxmins_expected = pd.DataFrame({'c_int': [0, 2], + 'c_float': [1, 3], + 'c_date': [1, 2]}, + index=['A', 'B']) + + idxmins_expected.index.name = "name" + + idxmaxs_expected = pd.DataFrame({'c_int': [1, 3], + 'c_float': [0, 2], + 'c_date': [0, 3]}, + index=['A', 'B']) + + idxmaxs_expected.index.name = "name" + + assert_frame_equal(idxmins_expected, idxmins_computed) + assert_frame_equal(idxmaxs_expected, idxmaxs_computed) From 4248eace0fee8151917a51edeec8459b17fa1369 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Mon, 4 Mar 2019 09:55:04 +0100 Subject: [PATCH 03/13] Add GH issue link --- pandas/tests/groupby/test_groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5696b625a33f1..4e127f9f31ac8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1717,6 +1717,7 @@ def test_groupby_multiindex_nat(): def test_idxmin_idxmax_returns_int_types(): + # GH 25444 df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], 'c_int': [1, 2, 3, 4], 'c_float': [4.02, 3.03, 2.04, 1.05], From 538f8f56be645abf265d2331ad81b2d271effa66 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Mon, 4 Mar 2019 09:58:52 +0100 Subject: [PATCH 04/13] Pep8 fixes --- pandas/tests/groupby/test_groupby.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4e127f9f31ac8..cc8492ccfa0c3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1718,9 +1718,9 @@ def test_groupby_multiindex_nat(): def test_idxmin_idxmax_returns_int_types(): # GH 25444 - df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], - 'c_int': [1, 2, 3, 4], - 'c_float': [4.02, 3.03, 2.04, 1.05], + df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], + 'c_int': [1, 2, 3, 4], + 'c_float': [4.02, 3.03, 2.04, 1.05], 'c_date': ['2019', '2018', '2016', '2017']}) df['c_date'] = pd.to_datetime(df['c_date']) @@ -1734,14 +1734,14 @@ def test_idxmin_idxmax_returns_int_types(): idxmins_expected = pd.DataFrame({'c_int': [0, 2], 'c_float': [1, 3], 'c_date': [1, 2]}, - index=['A', 'B']) + index=['A', 'B']) idxmins_expected.index.name = "name" idxmaxs_expected = pd.DataFrame({'c_int': [1, 3], 'c_float': [0, 2], 'c_date': [0, 3]}, - index=['A', 'B']) + index=['A', 'B']) idxmaxs_expected.index.name = "name" From d874ef7adf0b5fdad7525ed8b26164447ba9044f Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Mon, 4 Mar 2019 22:52:30 +0100 Subject: [PATCH 05/13] Move test to test_function.py and implement PR changes --- pandas/tests/groupby/test_function.py | 22 ++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 33 --------------------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b5e328ef64424..fdc92c2737f52 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -400,6 +400,28 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] +@pytest.mark.parametrize("func, expected", [ + ("idxmin", {'c_int': [0, 2], 'c_float': [1, 3], 'c_date': [1, 2]}), + ("idxmax", {'c_int': [1, 3], 'c_float': [0, 2], 'c_date': [0, 3]}) +]) +def test_idxmin_idxmax_returns_int_types(func, expected): + # GH 25444 + df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], + 'c_int': [1, 2, 3, 4], + 'c_float': [4.02, 3.03, 2.04, 1.05], + 'c_date': ['2019', '2018', '2016', '2017']}) + df['c_date'] = pd.to_datetime(df['c_date']) + + result = getattr(df.groupby('name'), func)() + + for col in result: + assert result[col].dtype == np.int64 + + df_expected = pd.DataFrame(expected, index=Index(['A', 'B'], name="name")) + + tm.assert_frame_equal(result, df_expected) + + def test_fill_consistency(): # GH9221 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cc8492ccfa0c3..6a11f0ae9b44a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1714,36 +1714,3 @@ def test_groupby_multiindex_nat(): result = ser.groupby(level=1).mean() expected = pd.Series([3., 2.5], index=["a", "b"]) assert_series_equal(result, expected) - - -def test_idxmin_idxmax_returns_int_types(): - # GH 25444 - df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], - 'c_int': [1, 2, 3, 4], - 'c_float': [4.02, 3.03, 2.04, 1.05], - 'c_date': ['2019', '2018', '2016', '2017']}) - df['c_date'] = pd.to_datetime(df['c_date']) - - idxmins_computed = df.groupby('name').idxmin() - idxmaxs_computed = df.groupby('name').idxmax() - - for col in idxmaxs_computed: - assert idxmins_computed[col].dtype == np.int64 - assert idxmaxs_computed[col].dtype == np.int64 - - idxmins_expected = pd.DataFrame({'c_int': [0, 2], - 'c_float': [1, 3], - 'c_date': [1, 2]}, - index=['A', 'B']) - - idxmins_expected.index.name = "name" - - idxmaxs_expected = pd.DataFrame({'c_int': [1, 3], - 'c_float': [0, 2], - 'c_date': [0, 3]}, - index=['A', 'B']) - - idxmaxs_expected.index.name = "name" - - assert_frame_equal(idxmins_expected, idxmins_computed) - assert_frame_equal(idxmaxs_expected, idxmaxs_computed) From 29afd36e88bc2e19ab04add6219f526fce69cfea Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Wed, 6 Mar 2019 22:23:48 +0100 Subject: [PATCH 06/13] Unit test updates according to PR --- pandas/tests/groupby/test_function.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index fdc92c2737f52..4ea0d12656ee4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -400,11 +400,11 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] -@pytest.mark.parametrize("func, expected", [ +@pytest.mark.parametrize("func, values", [ ("idxmin", {'c_int': [0, 2], 'c_float': [1, 3], 'c_date': [1, 2]}), ("idxmax", {'c_int': [1, 3], 'c_float': [0, 2], 'c_date': [0, 3]}) ]) -def test_idxmin_idxmax_returns_int_types(func, expected): +def test_idxmin_idxmax_returns_int_types(func, values): # GH 25444 df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], 'c_int': [1, 2, 3, 4], @@ -414,12 +414,9 @@ def test_idxmin_idxmax_returns_int_types(func, expected): result = getattr(df.groupby('name'), func)() - for col in result: - assert result[col].dtype == np.int64 + expected = pd.DataFrame(values, index=Index(['A', 'B'], name="name")) - df_expected = pd.DataFrame(expected, index=Index(['A', 'B'], name="name")) - - tm.assert_frame_equal(result, df_expected) + tm.assert_frame_equal(result, expected) def test_fill_consistency(): From f23e884db3d44b8a8e8fff99e8f6eea4f3b1a786 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Wed, 6 Mar 2019 22:24:12 +0100 Subject: [PATCH 07/13] Update in cast handling --- pandas/core/groupby/base.py | 3 ++- pandas/core/groupby/generic.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index ebba4a0a9395d..02d96997986a6 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -89,7 +89,8 @@ def _gotitem(self, key, ndim, subset=None): cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', 'cummin', 'cummax']) -cython_cast_blacklist = frozenset(['rank', 'count', 'size']) +cython_cast_blacklist = frozenset(['rank', 'count', 'size', 'idxmin', + 'idxmax']) def whitelist_method_generator(base, klass, whitelist): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 84bf561f56b83..86713ec04c108 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -255,8 +255,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + cast = self._transform_should_cast(func) + result[item] = colg.aggregate(func, *args, **kwargs) - if func != "idxmin" and func != "idxmax": + if cast: result[item] = self._try_cast(result[item], data) except ValueError: From 73a4545a00ce04d6492840185369c451c866de15 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Wed, 6 Mar 2019 22:30:58 +0100 Subject: [PATCH 08/13] Whats new added --- doc/source/whatsnew/v0.24.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 4fcde7769b362..655a6f38167f5 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -86,6 +86,7 @@ Bug Fixes - Bug in :meth:`pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) +- Bug in groupby.idxmin() that returns DateTime values for DataTime columns (:issue:`25444`) - **Visualization** From 510b22135575ee6ebedd559a6d5ecb69e4b9a376 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Tue, 19 Mar 2019 21:48:08 +0100 Subject: [PATCH 09/13] Whats new moved and re-formulated --- doc/source/whatsnew/v0.24.2.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 68314af774c14..6ad299de45e2a 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -53,7 +53,6 @@ Bug Fixes - Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) -- Bug in groupby.idxmin() that returns DateTime values for DataTime columns (:issue:`25444`) **Visualization** diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8e72ce83ac028..0ae137ca65651 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -254,6 +254,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) +- Bug in :func:`pandas.core.groupby.GroupBy.idxmin` and :func:`pandas.core.groupby.GroupBy.idxmax` where datetime columns would incorrectly return datetime values instead of ints (:issue:`25444`) Reshaping From 3cd23e7e2539fe0d131c3eb545a06563097b81ab Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Sat, 23 Mar 2019 22:11:18 +0100 Subject: [PATCH 10/13] Unit test for GH 15306 --- pandas/tests/groupby/test_transform.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index b645073fcf72a..9c6e7c0c97c47 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -845,3 +845,22 @@ def test_groupby_transform_timezone_column(func): expected = pd.DataFrame([[ts, 1, ts]], columns=['end_time', 'id', 'max_end_time']) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func, values", [ + ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), + ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]) +]) +def test_groupby_transform_with_datetimes(func, values): + # GH 15306 + dates = pd.date_range('1/1/2011', periods=10, freq='D') + + stocks = pd.DataFrame({'price': np.arange(10.0)}, index=dates) + stocks['week_id'] = pd.to_datetime(stocks.index).week + + result = stocks.groupby(stocks['week_id'])['price'].transform(func) + + expected = pd.Series(data=map(pd.to_datetime, values), + index=dates, name="price") + + tm.assert_series_equal(result, expected) From 7a639f97aa888e2b0f1ebf754264bb2d9a5aa104 Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Sat, 23 Mar 2019 22:14:36 +0100 Subject: [PATCH 11/13] Whats new updated wit GH 15306 --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0ae137ca65651..ec37484250045 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -254,7 +254,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) -- Bug in :func:`pandas.core.groupby.GroupBy.idxmin` and :func:`pandas.core.groupby.GroupBy.idxmax` where datetime columns would incorrectly return datetime values instead of ints (:issue:`25444`) +- Bug in :func:`pandas.core.groupby.GroupBy.idxmin` and :func:`pandas.core.groupby.GroupBy.idxmax` where datetime columns would incorrectly return datetime values instead of ints (:issue:`25444`, :issue:`15306`) Reshaping From ccc1048ce0d2b90db5ebf17a7195c64a0d9c305d Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Sun, 24 Mar 2019 21:09:19 +0100 Subject: [PATCH 12/13] Better usage of pd.to_datetime --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 9c6e7c0c97c47..26f39f8f41e2f 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -860,7 +860,7 @@ def test_groupby_transform_with_datetimes(func, values): result = stocks.groupby(stocks['week_id'])['price'].transform(func) - expected = pd.Series(data=map(pd.to_datetime, values), + expected = pd.Series(data=pd.to_datetime(values), index=dates, name="price") tm.assert_series_equal(result, expected) From 28b5ab2fe70420867aed8d09bde27df53e5ef1fd Mon Sep 17 00:00:00 2001 From: Radek Benes Date: Sun, 24 Mar 2019 21:43:50 +0100 Subject: [PATCH 13/13] Whats new updated --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ec37484250045..9405e7804a461 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -254,7 +254,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) -- Bug in :func:`pandas.core.groupby.GroupBy.idxmin` and :func:`pandas.core.groupby.GroupBy.idxmax` where datetime columns would incorrectly return datetime values instead of ints (:issue:`25444`, :issue:`15306`) +- Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) Reshaping