From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 01/28] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From dea38f24c0067ae3fe9484b837c9649714213bba Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:26:31 +0100 Subject: [PATCH 02/28] fix issue 17038 --- pandas/core/reshape/pivot.py | 4 +++- pandas/tests/reshape/test_pivot.py | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b443ba142369c..9743d90f4dd04 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,9 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: + + # GH 17038, this check should only happen if index is specified + if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 743fc50c87e96..46a05123c9fdd 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,12 +896,6 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - # no rows - rtable = self.data.pivot_table( - columns=["AA", "BB"], margins=True, aggfunc=np.mean - ) - assert isinstance(rtable, Series) - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -972,6 +966,20 @@ def test_pivot_integer_columns(self): tm.assert_frame_equal(table, table2, check_names=False) + @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) + def test_pivot_table_multiindex_only(self, cols): + # GH 17038 + df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) + + result = df2.pivot_table(values="v", columns=cols) + expected = DataFrame( + [[4, 5, 6]], + columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), + index=Index(["v"]), + ) + + tm.assert_frame_equal(result, expected) + def test_pivot_no_level_overlap(self): # GH #1181 From cd9e7ac3f31ffaf95cd628863df911dea9fa1248 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:29:43 +0100 Subject: [PATCH 03/28] revert change --- pandas/core/reshape/pivot.py | 3 +-- pandas/tests/reshape/test_pivot.py | 20 ++++++-------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 9743d90f4dd04..a7cdbb0da7a4e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -118,8 +118,7 @@ def pivot_table( table = agged - # GH 17038, this check should only happen if index is specified - if table.index.nlevels > 1 and index: + if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46a05123c9fdd..743fc50c87e96 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,6 +896,12 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() + # no rows + rtable = self.data.pivot_table( + columns=["AA", "BB"], margins=True, aggfunc=np.mean + ) + assert isinstance(rtable, Series) + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -966,20 +972,6 @@ def test_pivot_integer_columns(self): tm.assert_frame_equal(table, table2, check_names=False) - @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) - def test_pivot_table_multiindex_only(self, cols): - # GH 17038 - df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) - - result = df2.pivot_table(values="v", columns=cols) - expected = DataFrame( - [[4, 5, 6]], - columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"]), - ) - - tm.assert_frame_equal(result, expected) - def test_pivot_no_level_overlap(self): # GH #1181 From e5e912be0f596943067a7df812442764d311a086 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:30:16 +0100 Subject: [PATCH 04/28] revert change --- pandas/core/reshape/pivot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index a7cdbb0da7a4e..b443ba142369c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,6 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer From 4e1abde5fae34a79c6912290e5cfbad1491093b7 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 27 Jan 2020 20:31:28 +0100 Subject: [PATCH 05/28] try fix 31256 --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_categorical.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a8c96840ff17b..f68471a037ce8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -935,7 +935,7 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, numeric_only=True) + output[key] = result if len(output) == 0: return self._python_apply_general(f) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9323946581a0d..52e17f9a2594b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1342,3 +1342,18 @@ def test_series_groupby_categorical_aggregation_getitem(): result = groups["foo"].agg("mean") expected = groups.agg("mean")["foo"] tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func, expected_values", [ + (pd.Series.nunique, [1, 1, 2]), + (pd.Series.count, [1, 2, 2])]) +def test_groupby_agg_categorical_columns(func, expected_values): + # 31256 + df = pd.DataFrame({"id": [0, 1, 2, 3, 4], + "groups": [0, 1, 1, 2, 2], + "value": pd.Categorical([0, 0, 0, 0, 1]) + }).set_index('id') + result = df.groupby('groups').agg(func) + + expected = pd.DataFrame({"value": expected_values}, index=pd.Index([0, 1, 2], name="groups")) + tm.assert_frame_equal(result, expected) From 0b917c688218e7620dc1cf792d068810708de524 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 27 Jan 2020 20:33:21 +0100 Subject: [PATCH 06/28] pep8 --- pandas/tests/groupby/test_categorical.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 52e17f9a2594b..c3b4b6deea68e 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1344,16 +1344,22 @@ def test_series_groupby_categorical_aggregation_getitem(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("func, expected_values", [ - (pd.Series.nunique, [1, 1, 2]), - (pd.Series.count, [1, 2, 2])]) +@pytest.mark.parametrize( + "func, expected_values", + [(pd.Series.nunique, [1, 1, 2]), (pd.Series.count, [1, 2, 2])], +) def test_groupby_agg_categorical_columns(func, expected_values): # 31256 - df = pd.DataFrame({"id": [0, 1, 2, 3, 4], - "groups": [0, 1, 1, 2, 2], - "value": pd.Categorical([0, 0, 0, 0, 1]) - }).set_index('id') - result = df.groupby('groups').agg(func) + df = pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "groups": [0, 1, 1, 2, 2], + "value": pd.Categorical([0, 0, 0, 0, 1]), + } + ).set_index("id") + result = df.groupby("groups").agg(func) - expected = pd.DataFrame({"value": expected_values}, index=pd.Index([0, 1, 2], name="groups")) + expected = pd.DataFrame( + {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups") + ) tm.assert_frame_equal(result, expected) From e9cac5da3f0d23c79a9e7055f05b40652dd7bca9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 27 Jan 2020 21:40:00 +0100 Subject: [PATCH 07/28] fix test --- pandas/core/groupby/groupby.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f68471a037ce8..ed5a82aa06aa0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -807,7 +807,12 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - if is_extension_array_dtype(dtype) and dtype.kind != "M": + # should only cast to ea if it is not a reduction + if ( + is_extension_array_dtype(dtype) + and dtype.kind != "M" + and len(result) == len(obj) + ): # The function can return something of any type, so check # if the type is compatible with the calling EA. # datetime64tz is handled correctly in agg_series, @@ -935,7 +940,7 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = result + output[key] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) From e03357e16599295e652faebfc9842721bbe17a9a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 21:59:26 +0100 Subject: [PATCH 08/28] fix --- pandas/core/arrays/base.py | 2 +- pandas/core/groupby/groupby.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c3c91cea43f6b..caad19f4ee4f9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -47,7 +47,7 @@ def try_cast_to_ea(cls_or_instance, obj, dtype=None): ExtensionArray or obj """ try: - result = cls_or_instance._from_sequence(obj, dtype=dtype) + result = cls_or_instance._from_sequence(obj, dtype=dtype.name) except Exception: # We can't predict what downstream EA constructors may raise result = obj diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ed5a82aa06aa0..4d14c31955c16 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -807,11 +807,9 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - # should only cast to ea if it is not a reduction if ( is_extension_array_dtype(dtype) and dtype.kind != "M" - and len(result) == len(obj) ): # The function can return something of any type, so check # if the type is compatible with the calling EA. From a1df393492890f9bd6b6d5257a0649b6e9bfb6f3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 22:08:00 +0100 Subject: [PATCH 09/28] fix up tests --- pandas/tests/groupby/test_categorical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c3b4b6deea68e..613f36ce4622f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -738,7 +738,7 @@ def test_preserve_on_ordered_ops(func, values): g = df.groupby("payload") result = getattr(g, func)() expected = pd.DataFrame( - {"payload": [-2, -1], "col": pd.Series(values, dtype=c.dtype)} + {"payload": [-2, -1], "col": pd.Series(values, dtype="category")} ).set_index("payload") tm.assert_frame_equal(result, expected) @@ -1360,6 +1360,7 @@ def test_groupby_agg_categorical_columns(func, expected_values): result = df.groupby("groups").agg(func) expected = pd.DataFrame( - {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups") + {"value": pd.Index(expected_values, dtype="category")}, + index=pd.Index([0, 1, 2], name="groups"), ) tm.assert_frame_equal(result, expected) From 8743c47e781f6219650f2f7cbb40c8e62407f7ac Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 22:23:20 +0100 Subject: [PATCH 10/28] preserve order --- pandas/core/arrays/base.py | 1 + pandas/tests/groupby/test_categorical.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index caad19f4ee4f9..cb5f1558eaf68 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,6 +48,7 @@ def try_cast_to_ea(cls_or_instance, obj, dtype=None): """ try: result = cls_or_instance._from_sequence(obj, dtype=dtype.name) + result = result.set_ordered(dtype.ordered) except Exception: # We can't predict what downstream EA constructors may raise result = obj diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 613f36ce4622f..e5fa2afc859a6 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -737,8 +737,9 @@ def test_preserve_on_ordered_ops(func, values): df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) g = df.groupby("payload") result = getattr(g, func)() + expected_col = pd.Categorical(values, ordered=True) expected = pd.DataFrame( - {"payload": [-2, -1], "col": pd.Series(values, dtype="category")} + {"payload": [-2, -1], "col": expected_col} ).set_index("payload") tm.assert_frame_equal(result, expected) From 1e10d7179aa1f21829ee2c7020d5c4db6a43c965 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 22:33:51 +0100 Subject: [PATCH 11/28] add comment --- pandas/core/arrays/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cb5f1558eaf68..1ea699f66bb65 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,7 +48,10 @@ def try_cast_to_ea(cls_or_instance, obj, dtype=None): """ try: result = cls_or_instance._from_sequence(obj, dtype=dtype.name) - result = result.set_ordered(dtype.ordered) + + # still preserve order for categorical + if hasattr(cls_or_instance, "ordered"): + result = result.set_ordered(dtype.ordered) except Exception: # We can't predict what downstream EA constructors may raise result = obj From 905b3a5a4a6231ce1b1cb9633ce529f92808776b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 22:35:32 +0100 Subject: [PATCH 12/28] better test --- pandas/tests/groupby/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e5fa2afc859a6..88fe5b113dc89 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1361,7 +1361,7 @@ def test_groupby_agg_categorical_columns(func, expected_values): result = df.groupby("groups").agg(func) expected = pd.DataFrame( - {"value": pd.Index(expected_values, dtype="category")}, + {"value": pd.Categorical(expected_values)}, index=pd.Index([0, 1, 2], name="groups"), ) tm.assert_frame_equal(result, expected) From c5d670bc0880202794b8478bdc0c7332eb18f007 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 23:11:02 +0100 Subject: [PATCH 13/28] fixup --- pandas/core/arrays/base.py | 5 +---- pandas/core/groupby/groupby.py | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1ea699f66bb65..53cd6acdd613e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -47,11 +47,8 @@ def try_cast_to_ea(cls_or_instance, obj, dtype=None): ExtensionArray or obj """ try: - result = cls_or_instance._from_sequence(obj, dtype=dtype.name) + result = cls_or_instance._from_sequence(obj, dtype=dtype) - # still preserve order for categorical - if hasattr(cls_or_instance, "ordered"): - result = result.set_ordered(dtype.ordered) except Exception: # We can't predict what downstream EA constructors may raise result = obj diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4d14c31955c16..6a931567bbccf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -818,7 +818,11 @@ def _try_cast(self, result, obj, numeric_only: bool = False): # return the same type (Series) as our caller cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) + result = try_cast_to_ea(cls, result, dtype=dtype.name) + + # still preserve the order for categorical + if hasattr(result, "ordered"): + result = result.set_ordered(dtype.ordered) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) From 916d9b27cac7bb5e7ac569e80547d387fcd407d0 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 23:11:37 +0100 Subject: [PATCH 14/28] remove blank line --- pandas/core/arrays/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 53cd6acdd613e..c3c91cea43f6b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,7 +48,6 @@ def try_cast_to_ea(cls_or_instance, obj, dtype=None): """ try: result = cls_or_instance._from_sequence(obj, dtype=dtype) - except Exception: # We can't predict what downstream EA constructors may raise result = obj From 2208fc26fdb9432b72cc678b71ece1ca7dc86229 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 23:34:50 +0100 Subject: [PATCH 15/28] fix test --- pandas/tests/groupby/test_nth.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 0f850f2e94581..1c0c5819b7c8b 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -334,9 +334,7 @@ def test_first_last_tz_multi_column(method, ts, alpha): result = getattr(df.groupby("group"), method)() expected = pd.DataFrame( { - "category_string": pd.Categorical( - [alpha, "c"], dtype=category_string.dtype - ), + "category_string": pd.Categorical([alpha, "c"]), "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], }, index=pd.Index([1, 2], name="group"), From 86a254c9f116af1960678e2f3c0cf46e3ab9699b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 23:36:19 +0100 Subject: [PATCH 16/28] style --- pandas/core/groupby/groupby.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6a931567bbccf..d93681506bef8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -807,10 +807,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - if ( - is_extension_array_dtype(dtype) - and dtype.kind != "M" - ): + if is_extension_array_dtype(dtype) and dtype.kind != "M": # The function can return something of any type, so check # if the type is compatible with the calling EA. # datetime64tz is handled correctly in agg_series, From c36d97ba51ef003473c2ad581e4b0c3d4a2604bd Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 28 Jan 2020 23:55:12 +0100 Subject: [PATCH 17/28] linting --- pandas/tests/groupby/test_categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 88fe5b113dc89..cc772585c565f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -738,9 +738,9 @@ def test_preserve_on_ordered_ops(func, values): g = df.groupby("payload") result = getattr(g, func)() expected_col = pd.Categorical(values, ordered=True) - expected = pd.DataFrame( - {"payload": [-2, -1], "col": expected_col} - ).set_index("payload") + expected = pd.DataFrame({"payload": [-2, -1], "col": expected_col}).set_index( + "payload" + ) tm.assert_frame_equal(result, expected) From 3f8ea8f8245831164daddbe7bd82590150145e98 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 07:32:12 -0600 Subject: [PATCH 18/28] wip --- pandas/core/groupby/groupby.py | 9 +++------ pandas/tests/groupby/test_categorical.py | 5 ++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d93681506bef8..8ab777768e39f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -813,13 +813,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False): # datetime64tz is handled correctly in agg_series, # so is excluded here. - # return the same type (Series) as our caller - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype.name) + if len(result) and isinstance(result[0], dtype.type): + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) - # still preserve the order for categorical - if hasattr(result, "ordered"): - result = result.set_ordered(dtype.ordered) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cc772585c565f..8a3b306a9e40b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -737,7 +737,7 @@ def test_preserve_on_ordered_ops(func, values): df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) g = df.groupby("payload") result = getattr(g, func)() - expected_col = pd.Categorical(values, ordered=True) + expected_col = pd.Categorical(values, dtype=c.dtype) expected = pd.DataFrame({"payload": [-2, -1], "col": expected_col}).set_index( "payload" ) @@ -1361,7 +1361,6 @@ def test_groupby_agg_categorical_columns(func, expected_values): result = df.groupby("groups").agg(func) expected = pd.DataFrame( - {"value": pd.Categorical(expected_values)}, - index=pd.Index([0, 1, 2], name="groups"), + {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"), ) tm.assert_frame_equal(result, expected) From a6ad1a2bfb41db77a83da4b0f13ffd33f30a8a40 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 07:37:36 -0600 Subject: [PATCH 19/28] alternative --- pandas/tests/groupby/test_nth.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 1c0c5819b7c8b..0f850f2e94581 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -334,7 +334,9 @@ def test_first_last_tz_multi_column(method, ts, alpha): result = getattr(df.groupby("group"), method)() expected = pd.DataFrame( { - "category_string": pd.Categorical([alpha, "c"]), + "category_string": pd.Categorical( + [alpha, "c"], dtype=category_string.dtype + ), "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], }, index=pd.Index([1, 2], name="group"), From c4ebfa960b11dad3e755b56569ba2e555e54a039 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 09:21:36 -0600 Subject: [PATCH 20/28] Fixups * Ensure int for resample().sum() * Ensure float for resample().mean() --- pandas/core/groupby/ops.py | 11 +++++++++++ pandas/tests/resample/test_datetime_index.py | 4 +++- pandas/tests/resample/test_timedelta.py | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 679d3668523c2..2e95daa392976 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -543,6 +543,17 @@ def _cython_operation( if mask.any(): result = result.astype("float64") result[mask] = np.nan + elif ( + how == "add" + and is_integer_dtype(orig_values.dtype) + and is_extension_array_dtype(orig_values.dtype) + ): + # We need this to ensure that Series[Int64Dtype].resample().sum() + # remains int64 dtype. + # Two options for avoiding this special case + # 1. mask-aware ops and avoid casting to float with NaN above + # 2. specify the result dtype when calling this method + result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 4860329718f54..3ad82b9e075a8 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -122,7 +122,9 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" + [1, 4, 7], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype="float64", ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index d1bcdc55cb509..a4d14f127b80e 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex(): index=pd.to_timedelta([0, 10], unit="s"), ) expected = expected.reindex(["Group_obj", "Group"], axis=1) - expected["Group"] = expected["Group_obj"].astype("category") + expected["Group"] = expected["Group_obj"] tm.assert_frame_equal(result, expected) From bbad886420c1f1e65943d52cb7a8da9978d8cffd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 09:29:52 -0600 Subject: [PATCH 21/28] revert extranesou --- pandas/tests/groupby/test_categorical.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8a3b306a9e40b..718ba14b57497 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -737,10 +737,9 @@ def test_preserve_on_ordered_ops(func, values): df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) g = df.groupby("payload") result = getattr(g, func)() - expected_col = pd.Categorical(values, dtype=c.dtype) - expected = pd.DataFrame({"payload": [-2, -1], "col": expected_col}).set_index( - "payload" - ) + expected = pd.DataFrame( + {"payload": [-2, -1], "col": pd.Series(values, dtype=c.dtype)} + ).set_index("payload") tm.assert_frame_equal(result, expected) From a6a498efc58432f25d7cfea0744e0231c659f7a9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 09:32:46 -0600 Subject: [PATCH 22/28] non-numeric --- pandas/tests/groupby/test_categorical.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 718ba14b57497..6beb43a39383b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1363,3 +1363,12 @@ def test_groupby_agg_categorical_columns(func, expected_values): {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"), ) tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_non_numeric(): + df = pd.DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])} + ) + result = df.groupby([1, 2, 1]).nunique() + expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2]) + tm.assert_frame_equal(result, expected) From 2a3f5a20b5ae9affd282f806989974e92de7bd8e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 09:43:42 -0600 Subject: [PATCH 23/28] xfailing test --- .../tests/groupby/aggregate/test_aggregate.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 67bdcc246579e..2d31996a8a964 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -663,6 +663,27 @@ def test_aggregate_mixed_types(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="Not implemented.") +def test_aggregate_udf_na_extension_type(): + # https://github.com/pandas-dev/pandas/pull/31359 + # This is currently failing to cast back to Int64Dtype. + # The presence of the NA causes two problems + # 1. NA is not an instance of Int64Dtype.type (numpy.int64) + # 2. The presence of an NA forces object type, so the non-NA values is + # a Python int rather than a NumPy int64. Python ints aren't + # instances of numpy.int64. + def aggfunc(x): + if all(x > 2): + return 1 + else: + return pd.NA + + df = pd.DataFrame({"A": pd.array([1, 2, 3])}) + result = df.groupby([1, 1, 2]).agg(aggfunc) + expected = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2]) + tm.assert_frame_equal(result, expected) + + class TestLambdaMangling: def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) From ceef95e9303b2022f5f8733e08575ee485f4363c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 11:24:40 -0600 Subject: [PATCH 24/28] release note --- doc/source/whatsnew/v1.0.0.rst | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2abe85f042af1..ce474f0b149d4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -626,6 +626,51 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`. DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. Series([], dtype: float64) +Result dtype inference changes for resample operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The rules for the result dtype in :meth:`DataFrame.resample` aggregations have change for extension types (:issue:`31359`). +Previously, pandas would attempt to convert the result back to the original dtype, falling back to the usual +inference rules if that was not possible. Now, pandas will only return a result of the original dtype if the +scalar values in the result are instances of the extension dtype's scalar type. + +.. ipython:: python + + df = pd.DataFrame({"A": ['a', 'b']}, dtype='category', index=pd.date_range('2000', periods=2)) + df + + +*pandas 0.25.x* + +.. code-block:: python + + >>> df.resample("2D").agg(lambda x: 'a').A.dtype + CategoricalDtype(categories=['a', 'b'], ordered=False) + +*pandas 1.0.0* + +.. ipython:: python + + df.resample("2D").agg(lambda x: 'a').A.dtype + +This fixes an inconsistency between ``resample`` and ``groupby``. +This also fixes a potential bug, where the **values** of the result might change +depending on how the results are cast back to the original dtype. + +*pandas 0.25.x* + +.. code-block:: python + + df.resample("2D").agg(lambda x: 'c') + + A + 0 NaN + +.. ipython:: python + + df.resample("2D").agg(lambda x: 'c') + + .. _whatsnew_100.api_breaking.python: Increased minimum version for Python From ed91cc166ef7164598291a83b528542da80a208e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 11:55:08 -0600 Subject: [PATCH 25/28] fixup --- doc/source/whatsnew/v1.0.0.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ce474f0b149d4..3e9920b422814 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -636,7 +636,8 @@ scalar values in the result are instances of the extension dtype's scalar type. .. ipython:: python - df = pd.DataFrame({"A": ['a', 'b']}, dtype='category', index=pd.date_range('2000', periods=2)) + df = pd.DataFrame({"A": ['a', 'b']}, dtype='category', + index=pd.date_range('2000', periods=2)) df @@ -661,7 +662,7 @@ depending on how the results are cast back to the original dtype. .. code-block:: python - df.resample("2D").agg(lambda x: 'c') + >>> df.resample("2D").agg(lambda x: 'c') A 0 NaN From 9c7af0fe6a85ea30c85ec2116a1c435323604c63 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 13:05:05 -0600 Subject: [PATCH 26/28] fixup --- doc/source/whatsnew/v1.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3e9920b422814..70ab4e62f51ba 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -667,6 +667,8 @@ depending on how the results are cast back to the original dtype. A 0 NaN +*pandas 1.0.0* + .. ipython:: python df.resample("2D").agg(lambda x: 'c') From ca3564867b62feb0f70c6b9ae406c0c182e91c4f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 13:06:51 -0600 Subject: [PATCH 27/28] fixup --- pandas/tests/groupby/test_categorical.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6beb43a39383b..1c2de8c8c223f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1369,6 +1369,10 @@ def test_groupby_agg_non_numeric(): df = pd.DataFrame( {"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])} ) - result = df.groupby([1, 2, 1]).nunique() expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2]) + + result = df.groupby([1, 2, 1]).agg(pd.Series.nunique) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 2, 1]).nunique() tm.assert_frame_equal(result, expected) From 1b826bb710f4fb6f480127042b3c4e26ebb00f7e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jan 2020 13:08:00 -0600 Subject: [PATCH 28/28] fixup --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 70ab4e62f51ba..8025f7762f110 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -629,7 +629,7 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`. Result dtype inference changes for resample operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The rules for the result dtype in :meth:`DataFrame.resample` aggregations have change for extension types (:issue:`31359`). +The rules for the result dtype in :meth:`DataFrame.resample` aggregations have changed for extension types (:issue:`31359`). Previously, pandas would attempt to convert the result back to the original dtype, falling back to the usual inference rules if that was not possible. Now, pandas will only return a result of the original dtype if the scalar values in the result are instances of the extension dtype's scalar type.