Skip to content

[ArrayManager] Remaining GroupBy tests (fix count, pass on libreduction for now) #40050

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 27, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ jobs:
pytest pandas/tests/reductions/ --array-manager
pytest pandas/tests/generic/test_generic.py --array-manager
pytest pandas/tests/arithmetic/ --array-manager
pytest pandas/tests/groupby/aggregate/ --array-manager
pytest pandas/tests/groupby/ --array-manager
pytest pandas/tests/reshape/merge --array-manager

# indexing subset (temporary since other tests don't pass yet)
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1815,6 +1815,8 @@ def count(self) -> DataFrame:
ids, _, ngroups = self.grouper.group_info
mask = ids != -1

using_array_manager = isinstance(data, ArrayManager)

def hfunc(bvalues: ArrayLike) -> ArrayLike:
# TODO(2DEA): reshape would not be necessary with 2D EAs
if bvalues.ndim == 1:
Expand All @@ -1824,6 +1826,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
masked = mask & ~isna(bvalues)

counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
if using_array_manager:
# count_level_2d returns a (1, N) array for a single column
# -> extract 1D array
counted = counted[0, :]
return counted

new_mgr = data.grouped_reduce(hfunc)
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
MultiIndex,
ensure_index,
)
from pandas.core.internals import ArrayManager
from pandas.core.series import Series
from pandas.core.sorting import (
compress_group_index,
Expand Down Expand Up @@ -214,6 +215,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
# TODO: can we have a workaround for EAs backed by ndarray?
pass

elif isinstance(sdata._mgr, ArrayManager):
# TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
# for now -> relies on BlockManager internals
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm totally fine with skipping for the time being. Medium-term, I think it could use .arrays instead of .blocks; that might be easy-ish compat.

pass
elif (
com.get_callable_name(f) not in base.plotting_methods
and isinstance(splitter, FrameSplitter)
Expand Down
21 changes: 18 additions & 3 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,15 +270,30 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
-------
ArrayManager
"""
# TODO ignore_failures
result_arrays = [func(arr) for arr in self.arrays]
result_arrays: List[np.ndarray] = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks a whole lot like reduce, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's quite similar in logic (they are both reduce operations, so that's not surprising), but IMO they are different enough that trying to share anything will only make it more complex (the return value is different, they need to process the result inside the loop differently, etc.).

It might be possible to change the return value of reduce to make this easier, but that's a bigger change, so if we want that, it's for a separate PR

result_indices: List[int] = []

for i, arr in enumerate(self.arrays):
try:
res = func(arr)
except (TypeError, NotImplementedError):
if not ignore_failures:
raise
continue
result_arrays.append(res)
result_indices.append(i)

if len(result_arrays) == 0:
index = Index([None]) # placeholder
else:
index = Index(range(result_arrays[0].shape[0]))

return type(self)(result_arrays, [index, self.items])
if ignore_failures:
columns = self.items[np.array(result_indices, dtype="int64")]
else:
columns = self.items

return type(self)(result_arrays, [index, columns])

def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
"""
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/groupby/test_allowlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -355,7 +357,8 @@ def test_groupby_function_rename(mframe):
"cummax",
"cummin",
"cumprod",
"describe",
# TODO(ArrayManager) quantile
pytest.param("describe", marks=td.skip_array_manager_not_yet_implemented),
"rank",
"quantile",
"diff",
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -84,6 +86,7 @@ def test_apply_trivial_fail():
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used
def test_fast_apply():
# make sure that fast apply is correctly called
# rather than raising any kind of error
Expand Down Expand Up @@ -213,6 +216,7 @@ def test_group_apply_once_per_group2(capsys):
assert result == expected


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used
@pytest.mark.xfail(reason="GH-34998")
def test_apply_fast_slow_identical():
# GH 31613
Expand All @@ -233,6 +237,7 @@ def fast(group):
tm.assert_frame_equal(fast_df, slow_df)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used
@pytest.mark.parametrize(
"func",
[
Expand Down Expand Up @@ -313,6 +318,7 @@ def test_groupby_as_index_apply(df):
tm.assert_index_equal(res, ind)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_apply_concat_preserve_names(three_group):
grouped = three_group.groupby(["A", "B"])

Expand Down Expand Up @@ -1003,9 +1009,10 @@ def test_apply_function_with_indexing_return_column():
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="GH-34998")
def test_apply_with_timezones_aware():
def test_apply_with_timezones_aware(using_array_manager, request):
# GH: 27212
if not using_array_manager:
request.node.add_marker(pytest.mark.xfail(reason="GH-34998"))

dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2
index_no_tz = pd.DatetimeIndex(dates)
Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -81,6 +83,7 @@ def get_stats(group):
assert result.index.names[0] == "C"


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_basic():

cats = Categorical(
Expand Down Expand Up @@ -276,7 +279,9 @@ def test_apply(ordered):
tm.assert_series_equal(result, expected)


def test_observed(observed):
# TODO(ArrayManager) incorrect dtype for mean()
@td.skip_array_manager_not_yet_implemented
def test_observed(observed, using_array_manager):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
Expand Down Expand Up @@ -535,6 +540,7 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
assert False, msg


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_datetime():
# GH9049: ensure backward compatibility
levels = pd.date_range("2014-01-01", periods=4)
Expand Down Expand Up @@ -600,6 +606,7 @@ def test_categorical_index():
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_describe_categorical_columns():
# GH 11558
cats = CategoricalIndex(
Expand All @@ -614,6 +621,7 @@ def test_describe_categorical_columns():
tm.assert_categorical_equal(result.stack().columns.values, cats.values)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_unstack_categorical():
# GH11558 (example is taken from the original issue)
df = DataFrame(
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ def test_mad(self, gb, gni):
result = gni.mad()
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_describe(self, df, gb, gni):
# describe
expected_index = Index([1, 3], name="A")
Expand Down Expand Up @@ -923,11 +924,13 @@ def test_is_monotonic_decreasing(in_vals, out_vals):
# --------------------------------


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_apply_describe_bug(mframe):
grouped = mframe.groupby(level="first")
grouped.describe() # it works!


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_series_describe_multikey():
ts = tm.makeTimeSeries()
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
Expand All @@ -937,6 +940,7 @@ def test_series_describe_multikey():
tm.assert_series_equal(result["min"], grouped.min(), check_names=False)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_series_describe_single():
ts = tm.makeTimeSeries()
grouped = ts.groupby(lambda x: x.month)
Expand All @@ -951,6 +955,7 @@ def test_series_index_name(df):
assert result.index.name == "A"


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_frame_describe_multikey(tsframe):
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
Expand All @@ -973,6 +978,7 @@ def test_frame_describe_multikey(tsframe):
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_frame_describe_tupleindex():

# GH 14848 - regression from 0.19.0 to 0.19.1
Expand All @@ -992,6 +998,7 @@ def test_frame_describe_tupleindex():
df2.groupby("key").describe()


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_frame_describe_unstacked_format():
# GH 4792
prices = {
Expand All @@ -1018,6 +1025,7 @@ def test_frame_describe_unstacked_format():
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
@pytest.mark.filterwarnings(
"ignore:"
"indexing past lexsort depth may impact performance:"
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from pandas.compat import IS64
from pandas.errors import PerformanceWarning
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -210,6 +211,7 @@ def f(grp):
tm.assert_series_equal(result, e)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_pass_args_kwargs(ts, tsframe):
def f(x, q=None, axis=0):
return np.percentile(x, q, axis=axis)
Expand Down Expand Up @@ -364,6 +366,7 @@ def f3(x):
df2.groupby("a").apply(f3)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_attr_wrapper(ts):
grouped = ts.groupby(lambda x: x.weekday())

Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/groupby/test_quantile.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm

# TODO(ArrayManager) quantile
pytestmark = td.skip_array_manager_not_yet_implemented


@pytest.mark.parametrize(
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
ensure_platform_int,
is_timedelta64_dtype,
Expand Down Expand Up @@ -161,8 +163,11 @@ def test_transform_broadcast(tsframe, ts):
assert_fp_equal(res.xs(idx), agged[idx])


def test_transform_axis_1(request, transformation_func):
def test_transform_axis_1(request, transformation_func, using_array_manager):
# GH 36308
if using_array_manager and transformation_func == "pct_change":
# TODO(ArrayManager) column-wise shift
pytest.skip("ArrayManager: column-wise not yet implemented")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xfail?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, changed

warn = None
if transformation_func == "tshift":
warn = FutureWarning
Expand All @@ -183,6 +188,8 @@ def test_transform_axis_1(request, transformation_func):
tm.assert_equal(result, expected)


# TODO(ArrayManager) groupby().transform returns DataFrame backed by BlockManager
@td.skip_array_manager_not_yet_implemented
def test_transform_axis_ts(tsframe):

# make sure that we are setting the axes
Expand Down