From 4c5eddd63e94bacddb96bf61f81a6a8fcd9c33f0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Aug 2020 21:19:10 -0700 Subject: [PATCH 1/6] REF: remove unnecesary try/except --- pandas/core/groupby/generic.py | 69 ++++++++++++++++------------------ 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 166631e69f523..51532a75d2d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -31,7 +31,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -60,6 +60,7 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1034,32 +1035,31 @@ def _cython_agg_blocks( no_result = object() - def cast_result_block(result, block: "Block", how: str) -> "Block": - # see if we can cast the block to the desired dtype + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(block.dtype, how) + dtype = maybe_cast_result_dtype(values.dtype, how) result = maybe_downcast_numeric(result, dtype) - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical + if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): + # e.g. values was an IntegerArray + # (1, N) case can occur if values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype + result = type(values)._from_sequence( + result.ravel(), dtype=values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: "Block" = block.make_block(result) - return agg_block + return result def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1093,33 +1093,30 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - try: - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - except TypeError: - # we may have an exception in trying to aggregate - # continue and exclude the block - raise + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + result = cast(DataFrame, result) + # unwrap DataFrame to get array + if len(result._mgr.blocks) != 1: + # We've split an object block! Everything we've assumed + # about a single block input returning a single block output + # is a lie. To keep the code-path for the typical non-split case + # clean, we choose to clean up this mess later on. + assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_block.mgr_locs = [loc] + new_blocks.append(agg_block) else: - result = cast(DataFrame, result) - # unwrap DataFrame to get array - if len(result._mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. To keep the code-path for the typical non-split case - # clean, we choose to clean up this mess later on. - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_block.mgr_locs = [loc] - new_blocks.append(agg_block) - else: - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) + new_blocks = [agg_block] else: - agg_block = cast_result_block(result, block, how) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) new_blocks = [agg_block] return new_blocks From 42649fbb855a895ee5818d7dc80bdbd0ce0e9f5a Mon Sep 17 00:00:00 2001 From: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Date: Fri, 21 Aug 2020 17:34:51 -0500 Subject: [PATCH 2/6] TST: add test for agg on ordered categorical cols (#35630) --- .../tests/groupby/aggregate/test_aggregate.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ce9d4b892d775..8fe450fe6abfc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,6 +1063,85 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test single aggregations on ordered categorical cols GHGH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data=exp_data, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + for value in v: + multi_index_list.append([k, value]) + else: + multi_index_list.append([k, v]) + multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list)) + + expected_df = pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + def test_nonagg_agg(): # GH 35490 - Single/Multiple agg of non-agg function give same results # TODO: agg should raise for functions that don't aggregate From 47121ddc1c655f428c6c3fcea8fbf02eba85600a Mon Sep 17 00:00:00 2001 From: tkmz-n <60312218+tkmz-n@users.noreply.github.com> Date: Sat, 22 Aug 2020 07:42:50 +0900 Subject: [PATCH 3/6] TST: resample does not yield empty groups (#10603) (#35799) --- pandas/tests/resample/test_timedelta.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 0fbb60c176b30..3fa85e62d028c 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -150,3 +150,18 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): tm.assert_index_equal(result.index, expected_index) assert result.index.freq == expected_index.freq assert not np.isnan(result[-1]) + + +def test_resample_with_timedelta_yields_no_empty_groups(): + # GH 10603 + df = pd.DataFrame( + np.random.normal(size=(10000, 4)), + index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"), + ) + result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) + + expected = pd.DataFrame( + [[768.0] * 4] * 12 + [[528.0] * 4], + index=pd.timedelta_range(start="1s", periods=13, freq="3s"), + ) + tm.assert_frame_equal(result, expected) From 1decb3e0ee1923a29b8eded7507bcb783b3870d0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Aug 2020 18:48:02 -0700 Subject: [PATCH 4/6] revert accidental rebase --- pandas/core/groupby/generic.py | 61 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4b1f6cfe0a662..60e23b14eaf09 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -30,7 +30,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -59,7 +59,6 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms -from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1034,31 +1033,32 @@ def _cython_agg_blocks( no_result = object() - def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: - # see if we can cast the values to the desired dtype + def cast_result_block(result, block: "Block", how: str) -> "Block": + # see if we can cast the block to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(values.dtype, how) + dtype = maybe_cast_result_dtype(block.dtype, how) result = maybe_downcast_numeric(result, dtype) - if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): - # e.g. values was an IntegerArray - # (1, N) case can occur if values was Categorical + if block.is_extension and isinstance(result, np.ndarray): + # e.g. block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(values)._from_sequence( - result.ravel(), dtype=values.dtype + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - return result + agg_block: "Block" = block.make_block(result) + return agg_block def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1092,25 +1092,28 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - - assert isinstance(result, (Series, DataFrame)) # for mypy - # In the case of object dtype block, it may have been split - # in the operation. We un-split here. - result = result._consolidate() - assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 - - # unwrap DataFrame to get array - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) - new_blocks = [agg_block] + try: + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + except TypeError: + # we may have an exception in trying to aggregate + # continue and exclude the block + raise + else: + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. + result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_blocks = [agg_block] else: - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) + agg_block = cast_result_block(result, block, how) new_blocks = [agg_block] return new_blocks From 5f71a052eaa970944a0d2fc0688cb2f49b96de11 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Aug 2020 18:33:56 -0700 Subject: [PATCH 5/6] TYP: annotations in core.groupby --- pandas/core/groupby/categorical.py | 11 +++++++++-- pandas/core/groupby/generic.py | 17 +++++++++-------- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/grouper.py | 6 ++++-- pandas/core/groupby/ops.py | 2 +- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index db734bb2f0c07..420069700967a 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numpy as np from pandas.core.algorithms import unique1d @@ -6,9 +8,12 @@ CategoricalDtype, recode_for_categories, ) +from pandas.core.indexes.api import CategoricalIndex -def recode_for_groupby(c: Categorical, sort: bool, observed: bool): +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> Tuple[Categorical, Optional[Categorical]]: """ Code the categories to ensure we can groupby for categoricals. @@ -73,7 +78,9 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c: Categorical, sort: bool, ci): +def recode_from_groupby( + c: Categorical, sort: bool, ci: CategoricalIndex +) -> CategoricalIndex: """ Reverse the codes_to_groupby to account for sort / observed. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2afa56b50c3c7..9089b9ae8ca40 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -83,7 +83,7 @@ from pandas.plotting import boxplot_frame_groupby if TYPE_CHECKING: - from pandas.core.internals import Block + from pandas.core.internals import Block # noqa:F401 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) @@ -1595,7 +1595,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -1621,7 +1621,7 @@ def _gotitem(self, key, ndim: int, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj) -> DataFrame: + def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1638,7 +1638,7 @@ def _get_data_to_aggregate(self) -> BlockManager: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result): + def _insert_inaxis_grouper_inplace(self, result: DataFrame): # zip in reverse so we can always insert at loc 0 izip = zip( *map( @@ -1716,7 +1716,7 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks(self, blocks: Sequence["Block"], items: Index) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1743,7 +1743,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func): + def _apply_to_column_groupbys(self, func) -> DataFrame: from pandas.core.reshape.concat import concat return concat( @@ -1752,7 +1752,7 @@ def _apply_to_column_groupbys(self, func): axis=1, ) - def count(self): + def count(self) -> DataFrame: """ Compute count of group, excluding missing values. @@ -1782,7 +1782,7 @@ def count(self): return self._reindex_output(result, fill_value=0) - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> DataFrame: """ Return DataFrame with counts of unique elements in each position. @@ -1848,6 +1848,7 @@ def nunique(self, dropna: bool = True): ], axis=1, ) + assert isinstance(results, DataFrame) # for mypy if axis_number == 1: results = results.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f96b488fb8d0d..c0250d2a118d8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -459,7 +459,7 @@ def f(self): @contextmanager -def _group_selection_context(groupby): +def _group_selection_context(groupby: "_GroupBy"): """ Set / reset the _group_selection_context. """ @@ -489,7 +489,7 @@ def __init__( keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, - grouper: "Optional[ops.BaseGrouper]" = None, + grouper: Optional["ops.BaseGrouper"] = None, exclusions=None, selection=None, as_index: bool = True, @@ -734,7 +734,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) - def _make_wrapper(self, name): + def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist with _group_selection_context(self): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..18970ea0544e4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -568,7 +568,9 @@ def codes(self) -> np.ndarray: @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: - return recode_from_groupby(self.all_grouper, self.sort, self.group_index) + group_idx = self.group_index + assert isinstance(group_idx, CategoricalIndex) # set in __init__ + return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index @property @@ -607,7 +609,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": +) -> Tuple["ops.BaseGrouper", List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c6171a55359fe..2c8251f865c4f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -82,7 +82,7 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: "Sequence[grouper.Grouping]", + groupings: Sequence["grouper.Grouping"], sort: bool = True, group_keys: bool = True, mutated: bool = False, From bde6dacb40da849962ab53943b7986213fe64cf4 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 Aug 2020 16:59:51 -0700 Subject: [PATCH 6/6] update per comments --- pandas/core/groupby/categorical.py | 5 +++-- pandas/core/groupby/generic.py | 21 ++++++++------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 420069700967a..4d5acf527a867 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -98,7 +98,8 @@ def recode_from_groupby( """ # we re-order to the original category orderings if sort: - return ci.set_categories(c.categories) + return ci.set_categories(c.categories) # type: ignore [attr-defined] # we are not sorting, so add unobserved to the end - return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) + new_cats = c.categories[~c.categories.isin(ci.categories)] + return ci.add_categories(new_cats) # type: ignore [attr-defined] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c4cd4955570bb..e39464628ccaa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,6 +23,7 @@ Type, TypeVar, Union, + cast, ) import warnings @@ -1634,20 +1635,14 @@ def _get_data_to_aggregate(self) -> BlockManager: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result: DataFrame): + def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: # zip in reverse so we can always insert at loc 0 - izip = zip( - *map( - reversed, - ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings], - ), - ) - ) columns = result.columns - for name, lev, in_axis in izip: + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): # GH #28549 # When using .apply(-), name will be in columns already if in_axis and name not in columns: @@ -1844,7 +1839,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: ], axis=1, ) - assert isinstance(results, DataFrame) # for mypy + results = cast(DataFrame, results) if axis_number == 1: results = results.T