From 20049c1d675eaae481a57c402e7ef71dc743f413 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 9 Jan 2020 18:19:46 +0000 Subject: [PATCH 01/23] :bug: aggregations were getting overwritten if they had the same name --- pandas/core/groupby/generic.py | 10 ++-- .../tests/groupby/aggregate/test_aggregate.py | 50 +++++++++++++++++++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 27dd6e953c219..b986c3f967df4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -312,7 +312,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) results = {} - for name, func in arg: + for idx, (name, func) in enumerate(arg): obj = self # reset the cache so that we @@ -321,13 +321,15 @@ def _aggregate_multiple_funcs(self, arg): obj = copy.copy(obj) obj._reset_cache() obj._selection = name - results[name] = obj.aggregate(func) + results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func) if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle - return results + return {key.label: value for key, value in results.items()} - return DataFrame(results, columns=columns) + if results: + return DataFrame(self._wrap_aggregated_output(results), columns=columns) + return DataFrame(columns=columns) def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2d31996a8a964..3db3e079a2228 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -238,6 +238,56 @@ def test_agg_multiple_functions_maintain_order(df): tm.assert_index_equal(result.columns, exp_cols) +def test_agg_multiple_functions_same_name(df): + # GH 30880 + np.random.seed(1) + df = tm.makeTimeDataFrame() + result = df.resample("3D").agg( + {"A": [functools.partial(np.std, ddof=0), functools.partial(np.std, ddof=1)]} + ) + expected_index = pd.DatetimeIndex( + [ + "2000-01-03", + "2000-01-06", + "2000-01-09", + "2000-01-12", + "2000-01-15", + "2000-01-18", + "2000-01-21", + "2000-01-24", + "2000-01-27", + "2000-01-30", + "2000-02-02", + "2000-02-05", + "2000-02-08", + "2000-02-11", + ], + dtype="datetime64[ns]", + freq="3D", + ) + expected_columns = pd.MultiIndex.from_tuples([("A", "std"), ("A", "std")]) + expected_values = [ + [1.03497007, 1.26757429], + [0.96918813, 1.37063899], + [2.02317523, 2.86120185], + [0.44121013, 0.54036984], + [0.0, np.nan], + [0.80503606, 0.98596379], + [0.0, np.nan], + [0.3954067, 0.48427232], + [0.27030073, 0.38226296], + [1.12267144, 1.58769718], + [0.18796289, 0.23020659], + [0.0, np.nan], + [0.3540029, 0.43356324], + [0.0, np.nan], + ] + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + def test_multiple_functions_tuples_and_non_tuples(df): # #1359 funcs = [("foo", "mean"), "std"] From ab685fd12a3c31b946033f60906c25b88d5239e8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 21 Jan 2020 10:24:50 +0000 Subject: [PATCH 02/23] :art: shorten test for the sake of legibility --- .../tests/groupby/aggregate/test_aggregate.py | 40 ++----------------- 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3db3e079a2228..995db30669527 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -245,43 +245,11 @@ def test_agg_multiple_functions_same_name(df): result = df.resample("3D").agg( {"A": [functools.partial(np.std, ddof=0), functools.partial(np.std, ddof=1)]} ) - expected_index = pd.DatetimeIndex( - [ - "2000-01-03", - "2000-01-06", - "2000-01-09", - "2000-01-12", - "2000-01-15", - "2000-01-18", - "2000-01-21", - "2000-01-24", - "2000-01-27", - "2000-01-30", - "2000-02-02", - "2000-02-05", - "2000-02-08", - "2000-02-11", - ], - dtype="datetime64[ns]", - freq="3D", - ) + expected_index = pd.date_range("2000-01-03", "2000-02-11", freq="3D") expected_columns = pd.MultiIndex.from_tuples([("A", "std"), ("A", "std")]) - expected_values = [ - [1.03497007, 1.26757429], - [0.96918813, 1.37063899], - [2.02317523, 2.86120185], - [0.44121013, 0.54036984], - [0.0, np.nan], - [0.80503606, 0.98596379], - [0.0, np.nan], - [0.3954067, 0.48427232], - [0.27030073, 0.38226296], - [1.12267144, 1.58769718], - [0.18796289, 0.23020659], - [0.0, np.nan], - [0.3540029, 0.43356324], - [0.0, np.nan], - ] + expected_values = np.array( + [df.resample("3D").A.std(ddof=i).values for i in range(2)] + ).T expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index ) From e38e45039fdfcbdc371a27367ab24ec26022f93e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 21 Jan 2020 10:56:17 +0000 Subject: [PATCH 03/23] :art: handle empty in , make whatsnewentry public-facing --- pandas/core/groupby/generic.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b986c3f967df4..6c6b93e91706b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -311,7 +311,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results = {} + results: Mapping[base.OutputKey, Union[Series, DataFrame]] = {} for idx, (name, func) in enumerate(arg): obj = self @@ -326,10 +326,7 @@ def _aggregate_multiple_funcs(self, arg): if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle return {key.label: value for key, value in results.items()} - - if results: - return DataFrame(self._wrap_aggregated_output(results), columns=columns) - return DataFrame(columns=columns) + return DataFrame(self._wrap_aggregated_output(results), columns=columns) def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index @@ -360,8 +357,10 @@ def _wrap_series_output( if len(output) > 1: result = DataFrame(indexed_output, index=index) result.columns = columns - else: + elif not columns.empty: result = Series(indexed_output[0], index=index, name=columns[0]) + else: + result = DataFrame() return result From cb849a2a94863692de6862b64477fe1080b725b5 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 23 Jan 2020 15:16:56 +0000 Subject: [PATCH 04/23] :pencil: move whatsnew entry to v1.1.0 --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9fdda83abe944..584a059ba30a7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -132,7 +132,7 @@ Numeric Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) -- +- Bug in :meth:`DataFrame.replace` was changing other columns' dtypes when values in one column were being replaced with ``NaN`` (:issue:`30512`) - Strings @@ -202,6 +202,7 @@ Reshaping - Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) +- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30092`) Sparse ^^^^^^ From 521bc1de69e84242150bb3f4f4959ef94fc46c93 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 2 Feb 2020 08:17:38 +0000 Subject: [PATCH 05/23] remove accidentally added whatsnewentry --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 584a059ba30a7..c5320a2c28b7d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -132,7 +132,7 @@ Numeric Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) -- Bug in :meth:`DataFrame.replace` was changing other columns' dtypes when values in one column were being replaced with ``NaN`` (:issue:`30512`) +- - Strings From 6f9aac8dc53d051009fe940f70a0a4056fd090e2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 3 Mar 2020 14:25:45 +0000 Subject: [PATCH 06/23] Update v1.1.0.rst --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 95f3954282e1d..ba204f0d5c403 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -330,6 +330,7 @@ Reshaping - Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30092`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`) + Sparse ^^^^^^ From a8e9121ba20b234d8a2280895828664e3d1d60a7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Mar 2020 11:55:18 +0000 Subject: [PATCH 07/23] remove dataframe constructor --- pandas/core/groupby/generic.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e2872edc5a07c..03b478fba7006 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -326,7 +326,13 @@ def _aggregate_multiple_funcs(self, arg): if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle return {key.label: value for key, value in results.items()} - return DataFrame(self._wrap_aggregated_output(results), columns=columns) + + if not results: + return DataFrame() + output = self._wrap_aggregated_output(results) + if not isinstance(output, ABCDataFrame): + output = output.to_frame() + return output def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index @@ -357,10 +363,8 @@ def _wrap_series_output( if len(output) > 1: result = DataFrame(indexed_output, index=index) result.columns = columns - elif not columns.empty: - result = Series(indexed_output[0], index=index, name=columns[0]) else: - result = DataFrame() + result = Series(indexed_output[0], index=index, name=columns[0]) return result From b857c6d922e586430d79f7531f6523ed39159ad3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Mar 2020 11:56:11 +0000 Subject: [PATCH 08/23] Dict instead of Mapping --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 03b478fba7006..578c34b4154a5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -311,7 +311,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results: Mapping[base.OutputKey, Union[Series, DataFrame]] = {} + results: Dict[base.OutputKey, Union[Series, DataFrame]] = {} for idx, (name, func) in enumerate(arg): obj = self From 552063ac9e1983e016a2ffd805c118ae3c128f3a Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 15 Mar 2020 11:16:34 +0000 Subject: [PATCH 09/23] remove no longer necessary setting of random seed --- pandas/tests/groupby/aggregate/test_aggregate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4a7529a388117..3356908f32d86 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -252,7 +252,6 @@ def test_agg_multiple_functions_maintain_order(df): def test_agg_multiple_functions_same_name(df): # GH 30880 - np.random.seed(1) df = tm.makeTimeDataFrame() result = df.resample("3D").agg( {"A": [functools.partial(np.std, ddof=0), functools.partial(np.std, ddof=1)]} From 40f7e31c4ce9e172a85b289a7790421b4a1ba692 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 19 Apr 2020 11:45:55 +0100 Subject: [PATCH 10/23] don't return slice in concat --- pandas/core/groupby/generic.py | 5 +++-- pandas/core/reshape/concat.py | 4 ++-- pandas/tests/groupby/aggregate/test_aggregate.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b6df6122707b9..5e6d3bc44c272 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -293,7 +293,8 @@ def aggregate(self, func=None, *args, **kwargs): if isinstance(ret, dict): from pandas import concat - ret = concat(ret, axis=1) + ret = concat(ret.values(), axis=1, keys=[key.label for key in ret.keys()]) + return ret agg = aggregate @@ -336,7 +337,7 @@ def _aggregate_multiple_funcs(self, arg): if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle - return {key.label: value for key, value in results.items()} + return results if not results: return DataFrame() diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index a868e663b06a5..940e1465292c6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -618,9 +618,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): to_concat = [] - for key, index in zip(hlevel, indexes): + for i, (key, index) in enumerate(zip(hlevel, indexes)): try: - i = level.get_loc(key) + level.get_loc(key) except KeyError as err: raise ValueError(f"Key {key} not in level {level}") from err diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ecc07429ed49e..b055586623b34 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -268,6 +268,17 @@ def test_agg_multiple_functions_same_name(df): ) tm.assert_frame_equal(result, expected) + result = df.resample("3D").agg( + { + "A": [ + "ohlc", + functools.partial(np.std, ddof=0), + functools.partial(np.std, ddof=1), + ] + } + ) + pass + def test_multiple_functions_tuples_and_non_tuples(df): # #1359 From f8f2d7f5640a47f031785de48e52e54d3776b0b0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 19 Apr 2020 12:00:46 +0100 Subject: [PATCH 11/23] Add test containing ohlc --- .../tests/groupby/aggregate/test_aggregate.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b055586623b34..50e9d1c2d1ae5 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -268,6 +268,7 @@ def test_agg_multiple_functions_same_name(df): ) tm.assert_frame_equal(result, expected) + # check what happens if ohlc (which expands dimensions) is present result = df.resample("3D").agg( { "A": [ @@ -277,7 +278,21 @@ def test_agg_multiple_functions_same_name(df): ] } ) - pass + expected_columns = pd.MultiIndex.from_tuples( + [ + ("A", "ohlc", "open"), + ("A", "ohlc", "high"), + ("A", "ohlc", "low"), + ("A", "ohlc", "close"), + ("A", "std", "A"), + ("A", "std", "A"), + ] + ) + expected_values = np.hstack([df.resample("3D").A.ohlc(), expected_values]) + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) def test_multiple_functions_tuples_and_non_tuples(df): From dba7dde21cad1bd7e4efcdb32f404793be6f7548 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 19 Apr 2020 12:11:23 +0100 Subject: [PATCH 12/23] Add named aggregation resample test, add to whatsnew --- doc/source/whatsnew/v1.1.0.rst | 3 ++- .../tests/groupby/aggregate/test_aggregate.py | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2c63e870dd936..8ff7f0a19eb18 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -617,7 +617,8 @@ Reshaping - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) -- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30092`) +- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30880`) +- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in named aggregations being overwritten when using ``resample`` (:issue:`30092`) - Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 50e9d1c2d1ae5..53c01f528b445 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -2,6 +2,7 @@ test .agg behavior / note that .apply is tested generally in test_groupby.py """ import functools +from functools import partial import numpy as np import pytest @@ -295,6 +296,27 @@ def test_agg_multiple_functions_same_name(df): tm.assert_frame_equal(result, expected) +def test_named_aggregation_with_resample(): + # GH 30092 + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + result = df.resample("3T").agg( + {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.90)]} + ) + expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) + expected_values = expected_values = np.array( + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.9]] + ).T + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + def test_multiple_functions_tuples_and_non_tuples(df): # #1359 funcs = [("foo", "mean"), "std"] From 1b43ed1b0b3e68da6a66a12074ac979563758cb2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 19 Apr 2020 12:14:22 +0100 Subject: [PATCH 13/23] revert empty line change --- pandas/core/groupby/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5e6d3bc44c272..7b06a76d4d4e0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -294,7 +294,6 @@ def aggregate(self, func=None, *args, **kwargs): from pandas import concat ret = concat(ret.values(), axis=1, keys=[key.label for key in ret.keys()]) - return ret agg = aggregate From 868a68044e454c4bf227da47d9c628d9ebcd77b6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 19 Apr 2020 12:26:02 +0100 Subject: [PATCH 14/23] remove 30092 from whatsnew as the issue is already fixed in 1.0.3 and it's not clear what I should put --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8ff7f0a19eb18..30cf5796721e0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -618,7 +618,6 @@ Reshaping - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) - Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30880`) -- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in named aggregations being overwritten when using ``resample`` (:issue:`30092`) - Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`) From 14b2402c90455520d0685fb8986438100bee8b8f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 2 May 2020 19:18:14 +0100 Subject: [PATCH 15/23] catch performancewarning in test --- pandas/tests/groupby/aggregate/test_aggregate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e68e8c27789df..72d77185fab13 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -293,7 +295,8 @@ def test_agg_multiple_functions_same_name(df): expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index ) - tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(PerformanceWarning): + tm.assert_frame_equal(result, expected) def test_named_aggregation_with_resample(): From 862b39e046d5ba6ac632484a0894e5071bead56c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 May 2020 10:14:31 +0100 Subject: [PATCH 16/23] make test same as in OP --- .../tests/groupby/aggregate/test_aggregate.py | 55 ++++++------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 72d77185fab13..a473170ea1aa8 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -257,14 +257,18 @@ def test_agg_multiple_functions_maintain_order(df): def test_agg_multiple_functions_same_name(df): # GH 30880 - df = tm.makeTimeDataFrame() - result = df.resample("3D").agg( - {"A": [functools.partial(np.std, ddof=0), functools.partial(np.std, ddof=1)]} + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], ) - expected_index = pd.date_range("2000-01-03", "2000-02-11", freq="3D") - expected_columns = pd.MultiIndex.from_tuples([("A", "std"), ("A", "std")]) - expected_values = np.array( - [df.resample("3D").A.std(ddof=i).values for i in range(2)] + result = df.resample("3T").agg( + {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.90)]} + ) + expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) + expected_values = expected_values = np.array( + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.9]] ).T expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index @@ -272,14 +276,8 @@ def test_agg_multiple_functions_same_name(df): tm.assert_frame_equal(result, expected) # check what happens if ohlc (which expands dimensions) is present - result = df.resample("3D").agg( - { - "A": [ - "ohlc", - functools.partial(np.std, ddof=0), - functools.partial(np.std, ddof=1), - ] - } + result = df.resample("3T").agg( + {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.90)]} ) expected_columns = pd.MultiIndex.from_tuples( [ @@ -287,11 +285,11 @@ def test_agg_multiple_functions_same_name(df): ("A", "ohlc", "high"), ("A", "ohlc", "low"), ("A", "ohlc", "close"), - ("A", "std", "A"), - ("A", "std", "A"), + ("A", "quantile", "A"), + ("A", "quantile", "A"), ] ) - expected_values = np.hstack([df.resample("3D").A.ohlc(), expected_values]) + expected_values = np.hstack([df.resample("3T").A.ohlc(), expected_values]) expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index ) @@ -299,27 +297,6 @@ def test_agg_multiple_functions_same_name(df): tm.assert_frame_equal(result, expected) -def test_named_aggregation_with_resample(): - # GH 30092 - df = pd.DataFrame( - np.random.randn(1000, 3), - index=pd.date_range("1/1/2012", freq="S", periods=1000), - columns=["A", "B", "C"], - ) - result = df.resample("3T").agg( - {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.90)]} - ) - expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) - expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) - expected_values = expected_values = np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.9]] - ).T - expected = pd.DataFrame( - expected_values, columns=expected_columns, index=expected_index - ) - tm.assert_frame_equal(result, expected) - - def test_multiple_functions_tuples_and_non_tuples(df): # #1359 funcs = [("foo", "mean"), "std"] From 5e3f33331e36071fdaa9967ce69aad3a6ea0ccac Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 May 2020 10:16:55 +0100 Subject: [PATCH 17/23] make test match OP exactly --- pandas/tests/groupby/aggregate/test_aggregate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index a473170ea1aa8..a5bcc1c655cfd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -263,12 +263,12 @@ def test_agg_multiple_functions_same_name(df): columns=["A", "B", "C"], ) result = df.resample("3T").agg( - {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.90)]} + {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) expected_values = expected_values = np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.9]] + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index @@ -277,7 +277,7 @@ def test_agg_multiple_functions_same_name(df): # check what happens if ohlc (which expands dimensions) is present result = df.resample("3T").agg( - {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.90)]} + {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) expected_columns = pd.MultiIndex.from_tuples( [ From 51158ef9af8b91ac0cd6405c0c0a8c570df95e36 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 18 May 2020 08:26:54 +0100 Subject: [PATCH 18/23] split into two tests --- .../tests/groupby/aggregate/test_aggregate.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index a5bcc1c655cfd..373b1d7a832f2 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -267,7 +267,7 @@ def test_agg_multiple_functions_same_name(df): ) expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) - expected_values = expected_values = np.array( + expected_values = np.array( [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T expected = pd.DataFrame( @@ -275,10 +275,19 @@ def test_agg_multiple_functions_same_name(df): ) tm.assert_frame_equal(result, expected) - # check what happens if ohlc (which expands dimensions) is present + +def test_agg_multiple_functions_same_name_with_ohlc_present(df): + # GH 30880 + # ohlc expands dimensions, so different test to the above is required. + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) result = df.resample("3T").agg( {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) + expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) expected_columns = pd.MultiIndex.from_tuples( [ ("A", "ohlc", "open"), @@ -289,10 +298,18 @@ def test_agg_multiple_functions_same_name(df): ("A", "quantile", "A"), ] ) - expected_values = np.hstack([df.resample("3T").A.ohlc(), expected_values]) + expected_values = np.hstack( + [ + df.resample("3T").A.ohlc(), + np.array( + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + ).T, + ] + ) expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index ) + # PerformanceWarning is thrown by `assert col in right` in assert_frame_equal with tm.assert_produces_warning(PerformanceWarning): tm.assert_frame_equal(result, expected) From 447dfeac6a19dbda270e325e40f64a83439287a9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 18 May 2020 08:28:44 +0100 Subject: [PATCH 19/23] split into two tests --- pandas/tests/groupby/aggregate/test_aggregate.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 373b1d7a832f2..b0e6a95130284 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -298,14 +298,10 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(df): ("A", "quantile", "A"), ] ) - expected_values = np.hstack( - [ - df.resample("3T").A.ohlc(), - np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] - ).T, - ] - ) + non_ohlc_expected_values = np.array( + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + ).T + expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values]) expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index ) From aa988a413cafadd85b815228b6e3125b36b2f53c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 27 May 2020 19:00:34 +0100 Subject: [PATCH 20/23] add test with namedtuple --- .../tests/groupby/aggregate/test_aggregate.py | 49 ++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b0e6a95130284..19d6948ea1218 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -255,7 +255,7 @@ def test_agg_multiple_functions_maintain_order(df): tm.assert_index_equal(result.columns, exp_cols) -def test_agg_multiple_functions_same_name(df): +def test_agg_multiple_functions_same_name(): # GH 30880 df = pd.DataFrame( np.random.randn(1000, 3), @@ -276,7 +276,7 @@ def test_agg_multiple_functions_same_name(df): tm.assert_frame_equal(result, expected) -def test_agg_multiple_functions_same_name_with_ohlc_present(df): +def test_agg_multiple_functions_same_name_with_ohlc_present(): # GH 30880 # ohlc expands dimensions, so different test to the above is required. df = pd.DataFrame( @@ -310,6 +310,51 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(df): tm.assert_frame_equal(result, expected) +def test_multiple_aggregations_named_tuple(): + # GH 34380 + df = pd.DataFrame( + { + "name": [ + "abc", + "abc", + "abc", + "abc", + "abc", + "abc", + "xyz", + "xyz", + "xyz", + "xyz", + "xyz", + "xyz", + ], + "change": [ + np.nan, + 1.5, + -0.4, + 2.0, + -0.44444399999999995, + 2.2, + np.nan, + 4.0, + -0.4, + 3.333333, + -0.307692, + 1.222222, + ], + } + ) + result = df.groupby("name")["change"].agg( + pos=pd.NamedAgg(column="change", aggfunc=lambda x: x.gt(0).sum()), + neg=pd.NamedAgg(column="change", aggfunc=lambda x: x.lt(0).sum()), + ) + expected = pd.DataFrame( + {"pos": [3.0, 3.0], "neg": [2.0, 2.0]}, + index=pd.Index(["abc", "xyz"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + def test_multiple_functions_tuples_and_non_tuples(df): # #1359 funcs = [("foo", "mean"), "std"] From 7a62f5fd534ce35a0fe4d98b4fca138c7bc6d910 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 27 May 2020 19:06:16 +0100 Subject: [PATCH 21/23] better layout --- .../tests/groupby/aggregate/test_aggregate.py | 36 +++---------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 19d6948ea1218..ac6670efb6fc9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -312,38 +312,10 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): def test_multiple_aggregations_named_tuple(): # GH 34380 - df = pd.DataFrame( - { - "name": [ - "abc", - "abc", - "abc", - "abc", - "abc", - "abc", - "xyz", - "xyz", - "xyz", - "xyz", - "xyz", - "xyz", - ], - "change": [ - np.nan, - 1.5, - -0.4, - 2.0, - -0.44444399999999995, - 2.2, - np.nan, - 4.0, - -0.4, - 3.333333, - -0.307692, - 1.222222, - ], - } - ) + name = ["abc"] * 6 + ["xyz"] * 6 + change = [np.nan, 1, -0.4, 2.0, -0.4, 2.2] * 2 + + df = pd.DataFrame({"name": name, "change": change}) result = df.groupby("name")["change"].agg( pos=pd.NamedAgg(column="change", aggfunc=lambda x: x.gt(0).sum()), neg=pd.NamedAgg(column="change", aggfunc=lambda x: x.lt(0).sum()), From d80ddc5baba733e6ee6082bbcd357e0411266760 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 27 May 2020 19:06:42 +0100 Subject: [PATCH 22/23] better layout --- pandas/tests/groupby/aggregate/test_aggregate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ac6670efb6fc9..7e1c6f9558ccc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -314,7 +314,6 @@ def test_multiple_aggregations_named_tuple(): # GH 34380 name = ["abc"] * 6 + ["xyz"] * 6 change = [np.nan, 1, -0.4, 2.0, -0.4, 2.2] * 2 - df = pd.DataFrame({"name": name, "change": change}) result = df.groupby("name")["change"].agg( pos=pd.NamedAgg(column="change", aggfunc=lambda x: x.gt(0).sum()), From 62d91d1b32e0ca66c5c34f6b8c953100975e96a3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 27 Jun 2020 10:49:13 +0100 Subject: [PATCH 23/23] dont special case empty output --- pandas/core/groupby/generic.py | 6 +++--- pandas/tests/groupby/aggregate/test_aggregate.py | 16 ---------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d6652a47cad6e..9ab3562fd1d1a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -323,8 +323,6 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - if not results: - return DataFrame() output = self._wrap_aggregated_output(results) return self.obj._constructor_expanddim(output, columns=columns) @@ -357,10 +355,12 @@ def _wrap_series_output( if len(output) > 1: result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns - else: + elif not columns.empty: result = self.obj._constructor( indexed_output[0], index=index, name=columns[0] ) + else: + result = self.obj._constructor_expanddim() return result diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 661d2583cdb27..96e6525f3f7e4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -310,22 +310,6 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): tm.assert_frame_equal(result, expected) -def test_multiple_aggregations_named_tuple(): - # GH 34380 - name = ["abc"] * 6 + ["xyz"] * 6 - change = [np.nan, 1, -0.4, 2.0, -0.4, 2.2] * 2 - df = pd.DataFrame({"name": name, "change": change}) - result = df.groupby("name")["change"].agg( - pos=pd.NamedAgg(column="change", aggfunc=lambda x: x.gt(0).sum()), - neg=pd.NamedAgg(column="change", aggfunc=lambda x: x.lt(0).sum()), - ) - expected = pd.DataFrame( - {"pos": [3.0, 3.0], "neg": [2.0, 2.0]}, - index=pd.Index(["abc", "xyz"], name="name"), - ) - tm.assert_frame_equal(result, expected) - - def test_multiple_functions_tuples_and_non_tuples(df): # #1359 funcs = [("foo", "mean"), "std"]