
Commit 8e13da2

BUG: make sure that we are passing thru kwargs to groupby
BUG: allow timedelta64 to work in groupby with numeric_only=False

closes pandas-dev#5724

Author: Jeff Reback <[email protected]>

Closes pandas-dev#15054 from jreback/groupby_arg and squashes the following commits:

768fce1 [Jeff Reback] BUG: make sure that we are passing thru kwargs to groupby
        BUG: allow timedelta64 to work in groupby with numeric_only=False
1 parent 99afdd9 commit 8e13da2

File tree

4 files changed, +240 −34 lines changed


doc/source/whatsnew/v0.20.0.txt

+1
@@ -400,6 +400,7 @@ Bug Fixes
+- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
 - Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`)
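
For context, a minimal sketch (not part of the commit) of the kind of call this whatsnew entry refers to. The frame and column names are made up, and it assumes a pandas build that includes this fix:

import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b'],
                   'delta': pd.to_timedelta([1, 2, 3], unit='D')})

# Previously this raised because the generated groupby reductions did not
# accept keyword arguments; with this change numeric_only=False is forwarded
# and the timedelta64 column is aggregated rather than dropped.
print(df.groupby('g').sum(numeric_only=False))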

pandas/compat/numpy/function.py

+8 −2
@@ -306,12 +306,18 @@ def validate_expanding_func(name, args, kwargs):
             raise UnsupportedFunctionCall(msg)
 
 
-def validate_groupby_func(name, args, kwargs):
+def validate_groupby_func(name, args, kwargs, allowed=None):
     """
-    'args' and 'kwargs' should be empty because all of
+    'args' and 'kwargs' should be empty, except for allowed
+    kwargs because all of
     their necessary parameters are explicitly listed in
     the function signature
     """
+    if allowed is None:
+        allowed = []
+
+    kwargs = set(kwargs) - set(allowed)
+
     if len(args) + len(kwargs) > 0:
         raise UnsupportedFunctionCall((
             "numpy operations are not valid "

pandas/core/groupby.py

+108 −32
@@ -19,6 +19,7 @@
     is_categorical_dtype,
     is_datetimelike,
     is_datetime_or_timedelta_dtype,
+    is_datetime64_any_dtype,
     is_bool, is_integer_dtype,
     is_complex_dtype,
     is_bool_dtype,
@@ -109,10 +110,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
     @Substitution(name='groupby', f=name)
     @Appender(_doc_template)
     @Appender(_local_template)
-    def f(self):
+    def f(self, **kwargs):
+        if 'numeric_only' not in kwargs:
+            kwargs['numeric_only'] = numeric_only
         self._set_group_selection()
         try:
-            return self._cython_agg_general(alias, numeric_only=numeric_only)
+            return self._cython_agg_general(alias, alt=npfunc, **kwargs)
         except AssertionError as e:
             raise SpecificationError(str(e))
         except Exception:
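
A standalone sketch (not pandas code) of the pattern this hunk introduces: the factory bakes in a default for numeric_only, the generated method fills it in only when the caller did not supply one, and all keyword arguments are then forwarded to the real aggregator.

def make_reduction(default_numeric_only):
    def f(self, **kwargs):
        # same idea as the added 'numeric_only' check above
        kwargs.setdefault('numeric_only', default_numeric_only)
        return self._agg(**kwargs)          # stand-in for _cython_agg_general
    return f

class Demo(object):
    def _agg(self, numeric_only):
        return numeric_only
    total = make_reduction(True)

print(Demo().total())                       # True  (baked-in default)
print(Demo().total(numeric_only=False))     # False (caller override wins)
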
@@ -127,7 +130,9 @@ def f(self):
 
 
 def _first_compat(x, axis=0):
+
     def _first(x):
+
         x = np.asarray(x)
         x = x[notnull(x)]
         if len(x) == 0:
@@ -142,6 +147,7 @@ def _first(x):
 
 def _last_compat(x, axis=0):
     def _last(x):
+
         x = np.asarray(x)
         x = x[notnull(x)]
         if len(x) == 0:
@@ -775,14 +781,16 @@ def _try_cast(self, result, obj):
         return result
 
     def _cython_transform(self, how, numeric_only=True):
-        output = {}
+        output = collections.OrderedDict()
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
             if numeric_only and not is_numeric:
                 continue
 
             try:
                 result, names = self.grouper.transform(obj.values, how)
+            except NotImplementedError:
+                continue
             except AssertionError as e:
                 raise GroupByError(str(e))
             output[name] = self._try_cast(result, obj)
@@ -792,7 +800,7 @@ def _cython_transform(self, how, numeric_only=True):
 
         return self._wrap_transformed_output(output, names)
 
-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         output = {}
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
@@ -1015,26 +1023,26 @@ def mean(self, *args, **kwargs):
 
         For multiple groupings, the result index will be a MultiIndex
         """
-        nv.validate_groupby_func('mean', args, kwargs)
+        nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
         try:
-            return self._cython_agg_general('mean')
+            return self._cython_agg_general('mean', **kwargs)
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
             self._set_group_selection()
-            f = lambda x: x.mean(axis=self.axis)
+            f = lambda x: x.mean(axis=self.axis, **kwargs)
             return self._python_agg_general(f)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def median(self):
+    def median(self, **kwargs):
         """
         Compute median of groups, excluding missing values
 
         For multiple groupings, the result index will be a MultiIndex
         """
         try:
-            return self._cython_agg_general('median')
+            return self._cython_agg_general('median', **kwargs)
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
@@ -1044,7 +1052,7 @@ def median(self):
             def f(x):
                 if isinstance(x, np.ndarray):
                     x = Series(x)
-                return x.median(axis=self.axis)
+                return x.median(axis=self.axis, **kwargs)
             return self._python_agg_general(f)
 
     @Substitution(name='groupby')
@@ -1063,7 +1071,7 @@ def std(self, ddof=1, *args, **kwargs):
 
         # TODO: implement at Cython level?
         nv.validate_groupby_func('std', args, kwargs)
-        return np.sqrt(self.var(ddof=ddof))
+        return np.sqrt(self.var(ddof=ddof, **kwargs))
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1080,10 +1088,10 @@ def var(self, ddof=1, *args, **kwargs):
         """
         nv.validate_groupby_func('var', args, kwargs)
         if ddof == 1:
-            return self._cython_agg_general('var')
+            return self._cython_agg_general('var', **kwargs)
         else:
             self._set_group_selection()
-            f = lambda x: x.var(ddof=ddof)
+            f = lambda x: x.var(ddof=ddof, **kwargs)
             return self._python_agg_general(f)
 
     @Substitution(name='groupby')
@@ -1400,39 +1408,39 @@ def cumcount(self, ascending=True):
     @Appender(_doc_template)
     def cumprod(self, axis=0, *args, **kwargs):
         """Cumulative product for each group"""
-        nv.validate_groupby_func('cumprod', args, kwargs)
+        nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
         if axis != 0:
-            return self.apply(lambda x: x.cumprod(axis=axis))
+            return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
 
-        return self._cython_transform('cumprod')
+        return self._cython_transform('cumprod', **kwargs)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumsum(self, axis=0, *args, **kwargs):
         """Cumulative sum for each group"""
-        nv.validate_groupby_func('cumsum', args, kwargs)
+        nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
         if axis != 0:
-            return self.apply(lambda x: x.cumsum(axis=axis))
+            return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
 
-        return self._cython_transform('cumsum')
+        return self._cython_transform('cumsum', **kwargs)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def cummin(self, axis=0):
+    def cummin(self, axis=0, **kwargs):
         """Cumulative min for each group"""
         if axis != 0:
             return self.apply(lambda x: np.minimum.accumulate(x, axis))
 
-        return self._cython_transform('cummin')
+        return self._cython_transform('cummin', **kwargs)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def cummax(self, axis=0):
+    def cummax(self, axis=0, **kwargs):
         """Cumulative max for each group"""
         if axis != 0:
             return self.apply(lambda x: np.maximum.accumulate(x, axis))
 
-        return self._cython_transform('cummax')
+        return self._cython_transform('cummax', **kwargs)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
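
Illustrative only (the frame and names below are made up, and this assumes a pandas build with this change): the cumulative methods now tolerate a numeric_only keyword, which is forwarded to _cython_transform instead of being rejected by the numpy-argument validator.

import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1, 2, 3]})
# numeric_only is accepted and passed through rather than raising
print(df.groupby('g').cumsum(numeric_only=True))
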
@@ -1828,6 +1836,28 @@ def wrapper(*args, **kwargs):
     def _cython_operation(self, kind, values, how, axis):
         assert kind in ['transform', 'aggregate']
 
+        # can we do this operation with our cython functions
+        # if not raise NotImplementedError
+
+        # we raise NotImplemented if this is an invalid operation
+        # entirely, e.g. adding datetimes
+
+        # categoricals are only 1d, so we
+        # are not setup for dim transforming
+        if is_categorical_dtype(values):
+            raise NotImplementedError(
+                "categoricals are not support in cython ops ATM")
+        elif is_datetime64_any_dtype(values):
+            if how in ['add', 'prod', 'cumsum', 'cumprod']:
+                raise NotImplementedError(
+                    "datetime64 type does not support {} "
+                    "operations".format(how))
+        elif is_timedelta64_dtype(values):
+            if how in ['prod', 'cumprod']:
+                raise NotImplementedError(
+                    "timedelta64 type does not support {} "
+                    "operations".format(how))
+
         arity = self._cython_arity.get(how, 1)
 
         vdim = values.ndim
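
The checks above define which dtype/operation pairs the Cython path refuses. A standalone sketch (not pandas internals) of that gating, useful for seeing which combinations fall back to the slower path or get dropped:

def cython_op_supported(dtype_kind, how):
    # dtype_kind: 'categorical', 'datetime64', 'timedelta64', or anything else
    if dtype_kind == 'categorical':
        return False                         # never handled by the cython ops
    if dtype_kind == 'datetime64':
        return how not in ('add', 'prod', 'cumsum', 'cumprod')
    if dtype_kind == 'timedelta64':
        return how not in ('prod', 'cumprod')
    return True

assert not cython_op_supported('timedelta64', 'prod')   # products of timedeltas are rejected
assert cython_op_supported('timedelta64', 'add')        # sums of timedeltas are fine
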
@@ -3155,9 +3185,9 @@ def _iterate_slices(self):
                 continue
             yield val, slicer(val)
 
-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         new_items, new_blocks = self._cython_agg_blocks(
-            how, numeric_only=numeric_only)
+            how, alt=alt, numeric_only=numeric_only)
         return self._wrap_agged_blocks(new_items, new_blocks)
 
     def _wrap_agged_blocks(self, items, blocks):
@@ -3183,29 +3213,75 @@ def _wrap_agged_blocks(self, items, blocks):
 
     _block_agg_axis = 0
 
-    def _cython_agg_blocks(self, how, numeric_only=True):
-        data, agg_axis = self._get_data_to_aggregate()
+    def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
+        # TODO: the actual managing of mgr_locs is a PITA
+        # here, it should happen via BlockManager.combine
 
-        new_blocks = []
+        data, agg_axis = self._get_data_to_aggregate()
 
         if numeric_only:
             data = data.get_numeric_data(copy=False)
 
+        new_blocks = []
+        new_items = []
+        deleted_items = []
         for block in data.blocks:
 
-            result, _ = self.grouper.aggregate(
-                block.values, how, axis=agg_axis)
+            locs = block.mgr_locs.as_array
+            try:
+                result, _ = self.grouper.aggregate(
+                    block.values, how, axis=agg_axis)
+            except NotImplementedError:
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+
+                if alt is None:
+                    # we cannot perform the operation
+                    # in an alternate way, exclude the block
+                    deleted_items.append(locs)
+                    continue
+
+                # call our grouper again with only this block
+                obj = self.obj[data.items[locs]]
+                s = groupby(obj, self.grouper)
+                result = s.aggregate(lambda x: alt(x, axis=self.axis))
+                result = result._data.blocks[0]
 
             # see if we can cast the block back to the original dtype
             result = block._try_coerce_and_cast_result(result)
 
-            newb = make_block(result, placement=block.mgr_locs)
+            new_items.append(locs)
+            newb = block.make_block_same_class(result)
             new_blocks.append(newb)
 
         if len(new_blocks) == 0:
             raise DataError('No numeric types to aggregate')
 
-        return data.items, new_blocks
+        # reset the locs in the blocks to correspond to our
+        # current ordering
+        indexer = np.concatenate(new_items)
+        new_items = data.items.take(np.sort(indexer))
+
+        if len(deleted_items):
+
+            # we need to adjust the indexer to account for the
+            # items we have removed
+            # really should be done in internals :<
+
+            deleted = np.concatenate(deleted_items)
+            ai = np.arange(len(data))
+            mask = np.zeros(len(data))
+            mask[deleted] = 1
+            indexer = (ai - mask.cumsum())[indexer]
+
+        offset = 0
+        for b in new_blocks:
+            l = len(b.mgr_locs)
+            b.mgr_locs = indexer[offset:(offset + l)]
+            offset += l
+
+        return new_items, new_blocks
 
     def _get_data_to_aggregate(self):
         obj = self._obj_with_exclusions
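
The per-block logic above boils down to: try the Cython aggregation, fall back to the alternate (numpy) function when it raises NotImplementedError, and drop the block entirely when no alternative was supplied. A standalone sketch (not pandas internals) of that strategy:

def aggregate_block(values, fast_agg, alt=None):
    # try the cython-style fast path first
    try:
        return fast_agg(values)
    except NotImplementedError:
        # no alternate implementation: signal that the block should be dropped
        if alt is None:
            return None
        # otherwise fall back to the slower, python-level aggregation
        return alt(values)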
