Skip to content

ENH: retain masked EA dtypes in groupby with as_index=False #41373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Jul 25, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
eaf4fa9
ENH: retain masked EA dtypes in groupby with as_index=False
jbrockmendel May 7, 2021
02005f4
32bit compt
jbrockmendel May 7, 2021
93ed7e7
32bit compat
jbrockmendel May 7, 2021
9fbd50f
Merge branch 'master' into ref-group_index
jbrockmendel May 10, 2021
d4a986a
whatsnew
jbrockmendel May 10, 2021
9b1e560
Merge branch 'master' into ref-group_index
jbrockmendel May 17, 2021
fcfc4e4
Merge branch 'master' into ref-group_index
jbrockmendel May 17, 2021
ff5d851
Merge branch 'master' into ref-group_index
jbrockmendel May 17, 2021
cf6b37d
rename result_arraylike-> group_arraylike
jbrockmendel May 17, 2021
a617ee4
Merge branch 'master' into ref-group_index
jbrockmendel May 17, 2021
67aa1c6
Merge branch 'master' into ref-group_index
jbrockmendel May 19, 2021
03cd407
Merge branch 'master' into ref-group_index
jbrockmendel May 19, 2021
c9db010
Merge branch 'master' into ref-group_index
jbrockmendel May 21, 2021
238123b
Merge branch 'master' into ref-group_index
jbrockmendel May 21, 2021
446a77e
fix test
jbrockmendel May 21, 2021
77e90ca
Merge branch 'master' into ref-group_index
jbrockmendel Jun 1, 2021
802d18d
Merge branch 'master' into ref-group_index
jbrockmendel Jun 9, 2021
f767706
Merge branch 'master' into ref-group_index
jbrockmendel Jun 15, 2021
4c83e3e
Merge branch 'master' into ref-group_index
jbrockmendel Jun 16, 2021
b7a8599
Merge branch 'master' into ref-group_index
jbrockmendel Jun 17, 2021
cb37f57
Merge branch 'master' into ref-group_index
jbrockmendel Jun 18, 2021
ac4402b
fix broken uint64 cases
jbrockmendel Jun 18, 2021
30b07f9
Merge branch 'master' into ref-group_index
jbrockmendel Jun 21, 2021
f019783
Merge branch 'master' into ref-group_index
jbrockmendel Jun 22, 2021
c882029
Merge branch 'master' into ref-group_index
jbrockmendel Jul 1, 2021
e22cf97
move whatsnew to 1.4.0
jbrockmendel Jul 1, 2021
efe5976
Merge branch 'master' into ref-group_index
jbrockmendel Jul 7, 2021
6eca84d
Merge branch 'master' of https://github.com/pandas-dev/pandas into re…
jbrockmendel Jul 8, 2021
a32cb83
revert unnecessary convert_datetime
jbrockmendel Jul 8, 2021
0d64355
Merge branch 'master' into ref-group_index
jreback Jul 15, 2021
ecc3151
Merge branch 'master' into ref-group_index
jbrockmendel Jul 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
self._insert_inaxis_grouper_inplace(result)
result.index = np.arange(len(result))

return result._convert(datetime=True)
return result

agg = aggregate

Expand Down Expand Up @@ -1633,7 +1633,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
if self.axis == 1:
result = result.T

return self._reindex_output(result)._convert(datetime=True)
return self._reindex_output(result)

def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
Expand Down
25 changes: 22 additions & 3 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np

from pandas._typing import (
ArrayLike,
FrameOrSeries,
final,
)
Expand Down Expand Up @@ -462,6 +463,8 @@ def __init__(
self.in_axis = in_axis
self.dropna = dropna

self._group_arraylike = None

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
self.name = grouper.name
Expand Down Expand Up @@ -601,6 +604,22 @@ def codes(self) -> np.ndarray:
# expected "ndarray")
return self._codes # type: ignore[return-value]

@cache_readonly
def result_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but holding an ArrayLike to ensure
we can can retain ExtensionDtypes.
"""
ridx = self.result_index # initialized _group_arraylike
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you really need this state? seems very magical here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i agree, the statefulness is unpleasant. #41375 starts to unwind it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you try to unwind first? this is adding a lot

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, let’s get 41375 in and then I’ll rebased and try to trim this down


if self._group_arraylike is None:
# This should only occur when ridx is CategoricalIndex
self._group_arraylike = ridx._values

# error: Incompatible return value type (got "None", expected
# "Union[ExtensionArray, ndarray]")
return self._group_arraylike # type: ignore[return-value]

@cache_readonly
def result_index(self) -> Index:
if self.all_grouper is not None:
Expand All @@ -623,7 +642,7 @@ def _make_codes(self) -> None:
# we have a list of groupers
if isinstance(self.grouper, ops.BaseGrouper):
codes = self.grouper.codes_info
uniques = self.grouper.result_index
uniques = self.grouper.result_arraylike
else:
# GH35667, replace dropna=False with na_sentinel=None
if not self.dropna:
Expand All @@ -633,9 +652,9 @@ def _make_codes(self) -> None:
codes, uniques = algorithms.factorize(
self.grouper, sort=self.sort, na_sentinel=na_sentinel
)
uniques = Index(uniques, name=self.name)
self._codes = codes
self._group_index = uniques
self._group_arraylike = uniques
self._group_index = Index(uniques, name=self.name)

@cache_readonly
def groups(self) -> dict[Hashable, np.ndarray]:
Expand Down
26 changes: 23 additions & 3 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,6 +908,23 @@ def reconstructed_codes(self) -> list[np.ndarray]:
ids, obs_ids, _ = self.group_info
return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)

@cache_readonly
def result_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but returning an ndarray/ExtensionArray
allowing us to retain ExtensionDtypes not supported by Index.
"""
# TODO: once Index supports arbitrary EAs, this can be removed in favor
# of result_index
if len(self.groupings) == 1:
return self.groupings[0].result_arraylike

codes = self.reconstructed_codes
levels = [ping.result_arraylike for ping in self.groupings]
return MultiIndex(
levels=levels, codes=codes, verify_integrity=False, names=self.names
)._values

@cache_readonly
def result_index(self) -> Index:
if len(self.groupings) == 1:
Expand All @@ -924,12 +941,12 @@ def get_group_levels(self) -> list[Index]:
# Note: only called from _insert_inaxis_grouper_inplace, which
# is only called for BaseGrouper, never for BinGrouper
if len(self.groupings) == 1:
return [self.groupings[0].result_index]
return [self.groupings[0].result_arraylike]

name_list = []
for ping, codes in zip(self.groupings, self.reconstructed_codes):
codes = ensure_platform_int(codes)
levels = ping.result_index.take(codes)
levels = ping.result_arraylike.take(codes)

name_list.append(levels)

Expand Down Expand Up @@ -991,7 +1008,10 @@ def agg_series(self, obj: Series, func: F) -> ArrayLike:
result = self._aggregate_series_fast(obj, func)
cast_back = False

npvalues = lib.maybe_convert_objects(result, try_float=False)
convert_datetime = obj.dtype.kind == "M"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what breaks if we remove this inference entirely?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

huh, nothing now. im pretty sure there was something back when i did this. will revert

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated + green

npvalues = lib.maybe_convert_objects(
result, try_float=False, convert_datetime=convert_datetime
)
if cast_back:
# TODO: Is there a documented reason why we dont always cast_back?
out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ def test_grouping_grouper(self, data_for_grouping):
def test_groupby_extension_agg(self, as_index, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
result = df.groupby("B", as_index=as_index).A.mean()
_, index = pd.factorize(data_for_grouping, sort=True)
_, uniques = pd.factorize(data_for_grouping, sort=True)

index = pd.Index(index, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
if as_index:
index = pd.Index(uniques, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
self.assert_series_equal(result, expected)
else:
expected = expected.reset_index()
expected = pd.DataFrame({"B": uniques, "A": [3, 1, 4]})
self.assert_frame_equal(result, expected)

def test_groupby_agg_extension(self, data_for_grouping):
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,6 @@ def test_groupby_extension_apply(self):
we'll be able to dispatch unique.
"""

@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
super().test_groupby_extension_agg(as_index, data_for_grouping)

@pytest.mark.xfail(reason="GH#39098: Converts agg result to object")
def test_groupby_agg_extension(self, data_for_grouping):
super().test_groupby_agg_extension(data_for_grouping)
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,14 @@ def test_grouping_grouper(self, data_for_grouping):
def test_groupby_extension_agg(self, as_index, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
result = df.groupby("B", as_index=as_index).A.mean()
_, index = pd.factorize(data_for_grouping, sort=True)
_, uniques = pd.factorize(data_for_grouping, sort=True)

index = pd.Index(index, name="B")
expected = pd.Series([3, 1], index=index, name="A")
if as_index:
index = pd.Index(uniques, name="B")
expected = pd.Series([3, 1], index=index, name="A")
self.assert_series_equal(result, expected)
else:
expected = expected.reset_index()
expected = pd.DataFrame({"B": uniques, "A": [3, 1]})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a user facing change right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

self.assert_frame_equal(result, expected)

def test_groupby_agg_extension(self, data_for_grouping):
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,10 @@ def test_ops_not_as_index(reduction_func):
expected = expected.rename("size")
expected = expected.reset_index()

if reduction_func != "size":
# 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
expected["a"] = expected["a"].astype(df["a"].dtype)

g = df.groupby("a", as_index=False)

result = getattr(g, reduction_func)()
Expand Down