From cdd78dbe4172e452874ac8b98593aea19bc7b877 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 24 May 2019 14:44:27 -0400 Subject: [PATCH 01/11] BUG: preserve categorical & sparse types when grouping / pivot closes #18502 --- doc/source/whatsnew/v0.25.0.rst | 29 +++++++++++ pandas/core/groupby/generic.py | 11 +++- pandas/core/groupby/groupby.py | 42 ++++++++++++---- pandas/core/groupby/ops.py | 6 +-- pandas/core/indexing.py | 2 +- pandas/core/internals/blocks.py | 24 ++++++++- pandas/core/internals/construction.py | 5 +- pandas/core/nanops.py | 9 ++-- pandas/tests/groupby/test_function.py | 53 ++++++++++---------- pandas/tests/groupby/test_nth.py | 19 ++++--- pandas/tests/resample/test_datetime_index.py | 6 +++ pandas/tests/sparse/test_groupby.py | 10 ++-- 12 files changed, 154 insertions(+), 62 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1980e00f1073d..0449f37d3ac28 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -322,6 +322,35 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t s s.str.startswith(b'a') +.. _whatsnew_0250.api_breaking.groupby_categorical: + +Categorical dtypes are preserved during groupby +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) + +.. ipython:: python + + df = pd.DataFrame( + {'payload': [-1, -2, -1, -2], + 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) + df + df.dtypes + +*Previous Behavior*: + +.. code-block:: python + + In [5]: df.groupby('payload').first().col.dtype + Out[5]: dtype('O') + +*New Behavior*: + +.. ipython:: python + + df.groupby('payload').first().col.dtype + + .. _whatsnew_0250.api_breaking.incompatible_index_unions: Incompatible Index type unions diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a10920b7a5afb..f1cc54d5a460f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -158,12 +158,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, obj = self.obj[data.items[locs]] s = groupby(obj, self.grouper) - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + try: + result = s.aggregate(lambda x: alt(x, axis=self.axis)) + except Exception: + # we may have an exception in trying to aggregate + # continue and exclude the block + pass finally: + dtype = block.values.dtype + # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result) + result = block._try_coerce_and_cast_result(result, dtype=dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 64cacd60da30f..202d4fb15f971 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -786,6 +786,8 @@ def _try_cast(self, result, obj, numeric_only=False): elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. 
+ + # return the same type (Series) as our caller try: result = obj._values._from_sequence(result, dtype=dtype) except Exception: @@ -1157,7 +1159,8 @@ def mean(self, *args, **kwargs): """ nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) try: - return self._cython_agg_general('mean', **kwargs) + return self._cython_agg_general( + 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1179,7 +1182,11 @@ def median(self, **kwargs): Median of values within each group. """ try: - return self._cython_agg_general('median', **kwargs) + return self._cython_agg_general( + 'median', + alt=lambda x, + axis: Series(x).median(axis=axis, **kwargs), + **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1235,7 +1242,10 @@ def var(self, ddof=1, *args, **kwargs): nv.validate_groupby_func('var', args, kwargs) if ddof == 1: try: - return self._cython_agg_general('var', **kwargs) + return self._cython_agg_general( + 'var', + alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), + **kwargs) except Exception: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): @@ -1263,7 +1273,6 @@ def sem(self, ddof=1): Series or DataFrame Standard error of the mean of values within each group. """ - return self.std(ddof=ddof) / np.sqrt(self.count()) @Substitution(name='groupby') @@ -1290,7 +1299,7 @@ def _add_numeric_operations(cls): """ def groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False, + numeric_only=True, min_count=-1): _local_template = """ @@ -1312,17 +1321,30 @@ def f(self, **kwargs): kwargs['min_count'] = min_count self._set_group_selection() + + # try a cython aggregation if we can try: return self._cython_agg_general( alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: - result = self.aggregate( - lambda x: npfunc(x, axis=self.axis)) - if _convert: - result = result._convert(datetime=True) - return result + pass + + # apply a non-cython aggregation + result = self.aggregate( + lambda x: npfunc(x, axis=self.axis)) + + # coerce the resulting columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast( + result[col], self.obj[col]) + else: + result = self._try_cast( + result, self.obj) + + return result set_function_name(f, name, cls) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 010047a8be4ed..38478be5a8e07 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, + is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import _maybe_fill, isna @@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values): + if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( - "categoricals are not support in cython ops ATM") + "{} are not support in cython ops".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( diff --git 
a/pandas/core/indexing.py b/pandas/core/indexing.py index 1539feb2e0856..6a1e09f6bb303 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, - is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse) + is_list_like, is_numeric_dtype, is_scalar, is_sequence) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index db0eb44eabbfe..bb06bbbf6011d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -594,7 +594,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = astype_nansafe(values.ravel(), dtype, copy=True) + values = astype_nansafe( + values.ravel(), dtype, copy=True, **kwargs) # TODO(extension) # should we make this attribute? @@ -1746,6 +1747,27 @@ def _slice(self, slicer): return self.values[slicer] + def _try_cast_result(self, result, dtype=None): + """ + if we have an operation that operates on for example floats + we want to try to cast back to our EA here if possible + + result could be a 2-D numpy array, e.g. the result of + a numeric operation; but it must be shape (1, X) because + we by-definition operate on the ExtensionBlocks one-by-one + + result could also be an EA Array itself, in which case it + is already a 1-D array + """ + try: + + result = self._holder._from_sequence( + np.asarray(result).ravel(), dtype=dtype) + except Exception: + pass + + return result + def formatting_values(self): # Deprecating the ability to override _formatting_values. 
# Do the warning here, it's only user in pandas, since we diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 96b4ab7f3fbc6..0806e6e927e8d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -687,7 +687,10 @@ def sanitize_array(data, index, dtype=None, copy=False, data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if is_object_dtype(subarr.dtype) and dtype != 'object': + if (not (is_extension_array_dtype(subarr.dtype) or + is_extension_array_dtype(dtype)) and + is_object_dtype(subarr.dtype) and + not is_object_dtype(dtype)): inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'period': try: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7923e463c7719..24a28bf0005cb 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -72,11 +72,12 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, **kwargs): + def __init__(self, name=None, **kwargs): + self.name = name self.kwargs = kwargs def __call__(self, alt): - bn_name = alt.__name__ + bn_name = self.name or alt.__name__ try: bn_func = getattr(bn, bn_name) @@ -804,7 +805,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - @bottleneck_switch() + + @bottleneck_switch(name='nan' + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( @@ -824,7 +826,6 @@ def reduction(values, axis=None, skipna=True, mask=None): result = _wrap_results(result, dtype, fill_value) return _maybe_null_out(result, axis, mask, values.shape) - reduction.__name__ = 'nan' + meth return reduction diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 14f27f0c4c7d8..e4303c0a07076 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -12,7 +12,7 @@ from pandas import ( DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna) import pandas.core.nanops as nanops -from pandas.util import testing as tm +from pandas.util import _test_decorators as td, testing as tm @pytest.mark.parametrize("agg_func", ['any', 'all']) @@ -144,6 +144,7 @@ def test_arg_passthru(): index=Index([1, 2], name='group'), columns=['int', 'float', 'category_int', 'datetime', 'datetimetz', 'timedelta']) + for attr in ['mean', 'median']: f = getattr(df.groupby('group'), attr) result = f() @@ -459,35 +460,33 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) -def test_ops_general(): - ops = [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] - try: - from scipy.stats import sem - except ImportError: - pass - else: - ops.append(('sem', sem)) +def scipy_sem(*args, **kwargs): + from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) + + +@pytest.mark.parametrize( + 'op,targop', + [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + pytest.param( + 'sem', scipy_sem, marks=td.skip_if_no_scipy)]) +def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = 
np.random.randint(0, 50, size=1000).astype(float) - for op, targop in ops: - result = getattr(df.groupby(labels), op)().astype(float) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) def test_max_nan_bug(): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 6a08a8d79b63e..b174fb0e0b6f9 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -282,18 +282,21 @@ def test_first_last_tz(data, expected_first, expected_last): ]) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 + category_string = pd.Series(list('abc')).astype( + 'category') df = pd.DataFrame({'group': [1, 1, 2], - 'category_string': pd.Series(list('abc')).astype( - 'category'), + 'category_string': category_string, 'datetimetz': pd.date_range('20130101', periods=3, tz='US/Eastern')}) result = getattr(df.groupby('group'), method)() - expepcted = pd.DataFrame({'category_string': [alpha, 'c'], - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) - assert_frame_equal(result, expepcted) + expected = pd.DataFrame( + {'category_string': pd.Categorical( + [alpha, 'c'], dtype=category_string.dtype), + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expected) def test_nth_multi_index_as_expected(): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5711174ef0c9f..830ba6062cc72 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -112,6 +112,12 @@ def test_resample_integerarray(): dtype="Int64") assert_series_equal(result, expected) + result = ts.resample('3T').mean() + expected = Series([1, 4, 7], + index=pd.date_range('1/1/2000', periods=3, freq='3T'), + dtype='Int64') + assert_series_equal(result, expected) + def test_resample_basic_grouper(series): s = series diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index 531a4360c78a2..bf6055bc12725 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -29,11 +29,10 @@ def test_first_last_nth(self): sparse_grouped_last = sparse_grouped.last() sparse_grouped_nth = sparse_grouped.nth(1) - dense_grouped_first = dense_grouped.first().to_sparse() - dense_grouped_last = dense_grouped.last().to_sparse() - dense_grouped_nth = dense_grouped.nth(1).to_sparse() + dense_grouped_first = pd.DataFrame(dense_grouped.first().to_sparse()) + dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) + dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - # TODO: shouldn't these all be spares or not? 
tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) tm.assert_frame_equal(sparse_grouped_last, @@ -69,5 +68,6 @@ def test_groupby_includes_fill_value(fill_value): 'b': [fill_value, 1, fill_value, fill_value]}) sdf = df.to_sparse(fill_value=fill_value) result = sdf.groupby('a').sum() - expected = df.groupby('a').sum().to_sparse(fill_value=fill_value) + expected = pd.DataFrame(df.groupby('a').sum().to_sparse( + fill_value=fill_value)) tm.assert_frame_equal(result, expected, check_index_type=False) From 6751d0b4d59a786b62045c3914ff9f462547d891 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Jun 2019 21:46:04 -0500 Subject: [PATCH 02/11] typo --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6a1e09f6bb303..1539feb2e0856 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, - is_list_like, is_numeric_dtype, is_scalar, is_sequence) + is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna From b8be789a2818db9fd175470505ba15566625d38a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Jun 2019 22:18:57 -0500 Subject: [PATCH 03/11] moar tests --- pandas/tests/groupby/test_categorical.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f24fa0daa5b18..58a43dc218d33 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -697,6 +697,27 @@ def test_preserve_categorical_dtype(): tm.assert_frame_equal(result2, expected) +@pytest.mark.parametrize( + 'func, values', + [('first', ['second', 'first']), + ('last', ['fourth', 'third']), + ('min', ['fourth', 'first']), + ('max', ['second', 'third'])]) +def test_preserve_on_ordered_ops(func, values): + # gh-18502 + # preserve the categoricals on ops + c = pd.Categorical(['first', 'second', 'third', 'fourth'], ordered=True) + df = pd.DataFrame( + {'payload': [-1, -2, -1, -2], + 'col': c}) + g = df.groupby('payload') + result = getattr(g, func)() + expected = pd.DataFrame( + {'payload': [-2, -1], + 'col': pd.Series(values, dtype=c.dtype)}).set_index('payload') + tm.assert_frame_equal(result, expected) + + def test_categorical_no_compress(): data = Series(np.random.randn(9)) From 31d4635f44148aeb69663306454efd95306f198a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Jun 2019 22:44:23 -0500 Subject: [PATCH 04/11] use a fixed random seed --- pandas/tests/sparse/test_pivot.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 114e7b4bacd94..4b77e22024280 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -11,12 +11,13 @@ class TestPivotTable: def setup_method(self, method): + rs = np.random.RandomState(0) self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), + 'C': rs.randn(8), + 'D': rs.randn(8), 'E': [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan]}) self.sparse = self.dense.to_sparse() From 
ea98679a619ab0dfbfd29b7faaae91ac5b26e8da Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 09:39:37 -0500 Subject: [PATCH 05/11] xfail on np 1.17 --- pandas/tests/sparse/test_pivot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 4b77e22024280..2b6d2a4e63a0c 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -3,6 +3,7 @@ import pandas as pd import pandas.util.testing as tm +from pandas import _np_version_under1p17 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @@ -48,6 +49,8 @@ def test_pivot_table(self): # values='E', aggfunc='sum') # tm.assert_frame_equal(res_sparse, res_dense) + @pytest.mark.xfail(not _np_version_under1p17, + reason="failing occasionally on numpy > 1.17") def test_pivot_table_multi(self): res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', values=['D', 'E']) From 7ab00fa071c10654aabba6a4158d6c6685eb1713 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 10:02:45 -0500 Subject: [PATCH 06/11] lint --- pandas/tests/sparse/test_pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 2b6d2a4e63a0c..da5251ff4cbe0 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -2,8 +2,8 @@ import pytest import pandas as pd -import pandas.util.testing as tm from pandas import _np_version_under1p17 +import pandas.util.testing as tm @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") From bad7553d315acdfd1b6fbf598eec574102033af6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 11:26:01 -0500 Subject: [PATCH 07/11] groupby tests --- pandas/core/internals/blocks.py | 2 +- pandas/tests/extension/base/groupby.py | 12 ++++++++++++ pandas/tests/extension/decimal/test_decimal.py | 6 +++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bb06bbbf6011d..652f70746f618 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1762,7 +1762,7 @@ def _try_cast_result(self, result, dtype=None): try: result = self._holder._from_sequence( - np.asarray(result).ravel(), dtype=dtype) + result.ravel(), dtype=dtype) except Exception: pass diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 1929dad075695..daeec5923888c 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -64,6 +64,18 @@ def test_groupby_extension_apply( df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby('A').B.apply(lambda x: x.array) + expected = pd.Series([df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + df.B.iloc[[7]].array], + index=pd.Index([1, 2, 3, 4], name='A'), + name='B') + self.assert_series_equal(result, expected) + def test_in_numeric_groupby(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping, diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 94c0b61c6382a..fbea5c80a6e12 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ 
b/pandas/tests/extension/decimal/test_decimal.py @@ -192,7 +192,11 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): class TestGroupby(BaseDecimal, base.BaseGroupbyTests): - pass + + @pytest.mark.xfail( + reason="needs to correctly define __eq__ to handle nans, xref #27081.") + def test_groupby_apply_identity(self, data_for_grouping): + super().test_groupby_apply_idendeity(data_for_grouping) class TestSetitem(BaseDecimal, base.BaseSetitemTests): From 41e11e130347466f7d0c9d4dd5aba390282b85bb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 13:24:07 -0500 Subject: [PATCH 08/11] use strict=False --- pandas/tests/sparse/test_pivot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index da5251ff4cbe0..1841571eb61c8 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -50,7 +50,8 @@ def test_pivot_table(self): # tm.assert_frame_equal(res_sparse, res_dense) @pytest.mark.xfail(not _np_version_under1p17, - reason="failing occasionally on numpy > 1.17") + reason="failing occasionally on numpy > 1.17", + strict=False) def test_pivot_table_multi(self): res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', values=['D', 'E']) From ccfcca01cc9b87c19bcb821803ad02ed126cfe4a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 15:09:33 -0500 Subject: [PATCH 09/11] review comments --- pandas/core/groupby/generic.py | 2 +- pandas/tests/sparse/test_pivot.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f1cc54d5a460f..7c8c7956f8cb4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -160,7 +160,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, s = groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) - except Exception: + except TypeError: # we may have an exception in trying to aggregate # continue and exclude the block pass diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 1841571eb61c8..8f98117f20208 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -42,12 +42,12 @@ def test_pivot_table(self): values='E', aggfunc='mean') tm.assert_frame_equal(res_sparse, res_dense) - # ToDo: sum doesn't handle nan properly - # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - # values='E', aggfunc='sum') - # res_dense = pd.pivot_table(self.dense, index='A', columns='B', - # values='E', aggfunc='sum') - # tm.assert_frame_equal(res_sparse, res_dense) + def test_pivot_table_with_nans(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E', aggfunc='sum') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E', aggfunc='sum') + tm.assert_frame_equal(res_sparse, res_dense) @pytest.mark.xfail(not _np_version_under1p17, reason="failing occasionally on numpy > 1.17", From 48e7c32e6b4c39103bf9cc0cb3374d9363a27778 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 16:49:08 -0500 Subject: [PATCH 10/11] typo --- pandas/tests/extension/decimal/test_decimal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index fbea5c80a6e12..ecef835a9c797 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ 
b/pandas/tests/extension/decimal/test_decimal.py @@ -196,7 +196,7 @@ class TestGroupby(BaseDecimal, base.BaseGroupbyTests): @pytest.mark.xfail( reason="needs to correctly define __eq__ to handle nans, xref #27081.") def test_groupby_apply_identity(self, data_for_grouping): - super().test_groupby_apply_idendeity(data_for_grouping) + super().test_groupby_apply_identity(data_for_grouping) class TestSetitem(BaseDecimal, base.BaseSetitemTests): From 3a6a0c08dc5a65427f29fbf4c989b2b895b004aa Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 18:12:26 -0500 Subject: [PATCH 11/11] fix doc warning on master --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0449f37d3ac28..8f677b1f7dc76 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -838,7 +838,7 @@ ExtensionArray - Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). - :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`) -- Keyword argument ``deep`` has been removed from :method:`ExtensionArray.copy` (:issue:`27083`) +- Keyword argument ``deep`` has been removed from :meth:`ExtensionArray.copy` (:issue:`27083`) Other ^^^^^
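
The short sketch below is not part of the patch series above; it only illustrates the behavioural change these commits describe, mirroring the whatsnew example and the new ``test_preserve_on_ordered_ops`` test. The frame and column names (``df``, ``payload``, ``col``) are taken from the whatsnew entry, and the assertions assume a pandas build that includes these changes (0.25+); on earlier versions the dtype check fails because the non-key categorical column comes back as ``object``, which is the regression :issue:`18502` tracks.

.. code-block:: python

    # Illustrative only -- mirrors the whatsnew example and
    # test_preserve_on_ordered_ops; assumes pandas >= 0.25.
    import pandas as pd

    c = pd.Categorical(["first", "second", "third", "fourth"], ordered=True)
    df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c})

    result = df.groupby("payload").first()

    # Before this patch series the aggregated column was object dtype;
    # now the (ordered) categorical dtype of the non-key column survives.
    assert isinstance(result["col"].dtype, pd.CategoricalDtype)
    assert result["col"].dtype == c.dtype  # same categories, same ordering
    print(result["col"].dtype)             # category

The same preservation applies to the other aggregations exercised in the tests (``last``, ``min``, ``max``), and the sparse/``Int64`` resample tests in the series check the analogous extension-dtype round-trips.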