Skip to content

BUG: make sure that we are passing thru kwargs to groupby #15054

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ Bug Fixes



- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)


- Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising an ``IndexError`` (:issue:`14998`)
Expand Down
10 changes: 8 additions & 2 deletions pandas/compat/numpy/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,18 @@ def validate_expanding_func(name, args, kwargs):
raise UnsupportedFunctionCall(msg)


def validate_groupby_func(name, args, kwargs):
def validate_groupby_func(name, args, kwargs, allowed=None):
"""
'args' and 'kwargs' should be empty because all of
'args' and 'kwargs' should be empty, except for allowed
kwargs because all of
their necessary parameters are explicitly listed in
the function signature
"""
if allowed is None:
allowed = []

kwargs = set(kwargs) - set(allowed)

if len(args) + len(kwargs) > 0:
raise UnsupportedFunctionCall((
"numpy operations are not valid "
Expand Down
140 changes: 108 additions & 32 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
is_categorical_dtype,
is_datetimelike,
is_datetime_or_timedelta_dtype,
is_datetime64_any_dtype,
is_bool, is_integer_dtype,
is_complex_dtype,
is_bool_dtype,
Expand Down Expand Up @@ -109,10 +110,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
@Substitution(name='groupby', f=name)
@Appender(_doc_template)
@Appender(_local_template)
def f(self):
def f(self, **kwargs):
if 'numeric_only' not in kwargs:
kwargs['numeric_only'] = numeric_only
self._set_group_selection()
try:
return self._cython_agg_general(alias, numeric_only=numeric_only)
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
except AssertionError as e:
raise SpecificationError(str(e))
except Exception:
Expand All @@ -127,7 +130,9 @@ def f(self):


def _first_compat(x, axis=0):

def _first(x):

x = np.asarray(x)
x = x[notnull(x)]
if len(x) == 0:
Expand All @@ -142,6 +147,7 @@ def _first(x):

def _last_compat(x, axis=0):
def _last(x):

x = np.asarray(x)
x = x[notnull(x)]
if len(x) == 0:
Expand Down Expand Up @@ -775,14 +781,16 @@ def _try_cast(self, result, obj):
return result

def _cython_transform(self, how, numeric_only=True):
output = {}
output = collections.OrderedDict()
for name, obj in self._iterate_slices():
is_numeric = is_numeric_dtype(obj.dtype)
if numeric_only and not is_numeric:
continue

try:
result, names = self.grouper.transform(obj.values, how)
except NotImplementedError:
continue
except AssertionError as e:
raise GroupByError(str(e))
output[name] = self._try_cast(result, obj)
Expand All @@ -792,7 +800,7 @@ def _cython_transform(self, how, numeric_only=True):

return self._wrap_transformed_output(output, names)

def _cython_agg_general(self, how, numeric_only=True):
def _cython_agg_general(self, how, alt=None, numeric_only=True):
output = {}
for name, obj in self._iterate_slices():
is_numeric = is_numeric_dtype(obj.dtype)
Expand Down Expand Up @@ -1015,26 +1023,26 @@ def mean(self, *args, **kwargs):

For multiple groupings, the result index will be a MultiIndex
"""
nv.validate_groupby_func('mean', args, kwargs)
nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
try:
return self._cython_agg_general('mean')
return self._cython_agg_general('mean', **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
self._set_group_selection()
f = lambda x: x.mean(axis=self.axis)
f = lambda x: x.mean(axis=self.axis, **kwargs)
return self._python_agg_general(f)

@Substitution(name='groupby')
@Appender(_doc_template)
def median(self):
def median(self, **kwargs):
"""
Compute median of groups, excluding missing values

For multiple groupings, the result index will be a MultiIndex
"""
try:
return self._cython_agg_general('median')
return self._cython_agg_general('median', **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
Expand All @@ -1044,7 +1052,7 @@ def median(self):
def f(x):
if isinstance(x, np.ndarray):
x = Series(x)
return x.median(axis=self.axis)
return x.median(axis=self.axis, **kwargs)
return self._python_agg_general(f)

@Substitution(name='groupby')
Expand All @@ -1063,7 +1071,7 @@ def std(self, ddof=1, *args, **kwargs):

# TODO: implement at Cython level?
nv.validate_groupby_func('std', args, kwargs)
return np.sqrt(self.var(ddof=ddof))
return np.sqrt(self.var(ddof=ddof, **kwargs))

@Substitution(name='groupby')
@Appender(_doc_template)
Expand All @@ -1080,10 +1088,10 @@ def var(self, ddof=1, *args, **kwargs):
"""
nv.validate_groupby_func('var', args, kwargs)
if ddof == 1:
return self._cython_agg_general('var')
return self._cython_agg_general('var', **kwargs)
else:
self._set_group_selection()
f = lambda x: x.var(ddof=ddof)
f = lambda x: x.var(ddof=ddof, **kwargs)
return self._python_agg_general(f)

@Substitution(name='groupby')
Expand Down Expand Up @@ -1400,39 +1408,39 @@ def cumcount(self, ascending=True):
@Appender(_doc_template)
def cumprod(self, axis=0, *args, **kwargs):
"""Cumulative product for each group"""
nv.validate_groupby_func('cumprod', args, kwargs)
nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
if axis != 0:
return self.apply(lambda x: x.cumprod(axis=axis))
return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))

return self._cython_transform('cumprod')
return self._cython_transform('cumprod', **kwargs)

@Substitution(name='groupby')
@Appender(_doc_template)
def cumsum(self, axis=0, *args, **kwargs):
"""Cumulative sum for each group"""
nv.validate_groupby_func('cumsum', args, kwargs)
nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
if axis != 0:
return self.apply(lambda x: x.cumsum(axis=axis))
return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))

return self._cython_transform('cumsum')
return self._cython_transform('cumsum', **kwargs)

@Substitution(name='groupby')
@Appender(_doc_template)
def cummin(self, axis=0):
def cummin(self, axis=0, **kwargs):
"""Cumulative min for each group"""
if axis != 0:
return self.apply(lambda x: np.minimum.accumulate(x, axis))

return self._cython_transform('cummin')
return self._cython_transform('cummin', **kwargs)

@Substitution(name='groupby')
@Appender(_doc_template)
def cummax(self, axis=0):
def cummax(self, axis=0, **kwargs):
"""Cumulative max for each group"""
if axis != 0:
return self.apply(lambda x: np.maximum.accumulate(x, axis))

return self._cython_transform('cummax')
return self._cython_transform('cummax', **kwargs)

@Substitution(name='groupby')
@Appender(_doc_template)
Expand Down Expand Up @@ -1828,6 +1836,28 @@ def wrapper(*args, **kwargs):
def _cython_operation(self, kind, values, how, axis):
assert kind in ['transform', 'aggregate']

# can we do this operation with our cython functions
# if not raise NotImplementedError

# we raise NotImplemented if this is an invalid operation
# entirely, e.g. adding datetimes

# categoricals are only 1d, so we
# are not setup for dim transforming
if is_categorical_dtype(values):
raise NotImplementedError(
"categoricals are not support in cython ops ATM")
elif is_datetime64_any_dtype(values):
if how in ['add', 'prod', 'cumsum', 'cumprod']:
raise NotImplementedError(
"datetime64 type does not support {} "
"operations".format(how))
elif is_timedelta64_dtype(values):
if how in ['prod', 'cumprod']:
raise NotImplementedError(
"timedelta64 type does not support {} "
"operations".format(how))

arity = self._cython_arity.get(how, 1)

vdim = values.ndim
Expand Down Expand Up @@ -3155,9 +3185,9 @@ def _iterate_slices(self):
continue
yield val, slicer(val)

def _cython_agg_general(self, how, numeric_only=True):
def _cython_agg_general(self, how, alt=None, numeric_only=True):
new_items, new_blocks = self._cython_agg_blocks(
how, numeric_only=numeric_only)
how, alt=alt, numeric_only=numeric_only)
return self._wrap_agged_blocks(new_items, new_blocks)

def _wrap_agged_blocks(self, items, blocks):
Expand All @@ -3183,29 +3213,75 @@ def _wrap_agged_blocks(self, items, blocks):

_block_agg_axis = 0

def _cython_agg_blocks(self, how, numeric_only=True):
data, agg_axis = self._get_data_to_aggregate()
def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
# TODO: the actual managing of mgr_locs is a PITA
# here, it should happen via BlockManager.combine

new_blocks = []
data, agg_axis = self._get_data_to_aggregate()

if numeric_only:
data = data.get_numeric_data(copy=False)

new_blocks = []
new_items = []
deleted_items = []
for block in data.blocks:

result, _ = self.grouper.aggregate(
block.values, how, axis=agg_axis)
locs = block.mgr_locs.as_array
try:
result, _ = self.grouper.aggregate(
block.values, how, axis=agg_axis)
except NotImplementedError:
# generally if we have numeric_only=False
# and non-applicable functions
# try to python agg

if alt is None:
# we cannot perform the operation
# in an alternate way, exclude the block
deleted_items.append(locs)
continue

# call our grouper again with only this block
obj = self.obj[data.items[locs]]
s = groupby(obj, self.grouper)
result = s.aggregate(lambda x: alt(x, axis=self.axis))
result = result._data.blocks[0]

# see if we can cast the block back to the original dtype
result = block._try_coerce_and_cast_result(result)

newb = make_block(result, placement=block.mgr_locs)
new_items.append(locs)
newb = block.make_block_same_class(result)
new_blocks.append(newb)

if len(new_blocks) == 0:
raise DataError('No numeric types to aggregate')

return data.items, new_blocks
# reset the locs in the blocks to correspond to our
# current ordering
indexer = np.concatenate(new_items)
new_items = data.items.take(np.sort(indexer))

if len(deleted_items):

# we need to adjust the indexer to account for the
# items we have removed
# really should be done in internals :<

deleted = np.concatenate(deleted_items)
ai = np.arange(len(data))
mask = np.zeros(len(data))
mask[deleted] = 1
indexer = (ai - mask.cumsum())[indexer]

offset = 0
for b in new_blocks:
l = len(b.mgr_locs)
b.mgr_locs = indexer[offset:(offset + l)]
offset += l

return new_items, new_blocks

def _get_data_to_aggregate(self):
obj = self._obj_with_exclusions
Expand Down
Loading