Skip to content

Commit dec6354

Browse files
committed
ENH: add Series & DataFrame .agg/.aggregate to provide convenient
function application that mimics the groupby(..).agg/.aggregate interface. .apply is now a synonym for .agg, and will accept dict/list-likes for aggregations. CLN: rename .name attr -> ._selection_name from SeriesGroupBy for compat (didn't exist on DataFrameGroupBy); resolves conflicts w.r.t. setting .name on a groupby object. closes #1623, closes #14464, custom .describe closes #14483, closes #7014
1 parent 3f523f3 commit dec6354

File tree

9 files changed

+442
-47
lines changed

9 files changed

+442
-47
lines changed

pandas/core/base.py

+55-13
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,9 @@ class SelectionMixin(object):
289289
}
290290

291291
@property
292-
def name(self):
292+
def _selection_name(self):
293+
""" return a name for myself; this would ideally be the 'name' property, but
294+
we cannot conflict with the Series.name property which can be set """
293295
if self._selection is None:
294296
return None # 'result'
295297
else:
@@ -404,6 +406,26 @@ def aggregate(self, func, *args, **kwargs):
404406

405407
agg = aggregate
406408

409+
def _try_aggregate_string_function(self, arg, *args, **kwargs):
410+
"""
411+
if arg is a string, then try to operate on it:
412+
- try to find a function on ourselves
413+
- try to find a numpy function
414+
- raise
415+
416+
"""
417+
assert isinstance(arg, compat.string_types)
418+
419+
f = getattr(self, arg, None)
420+
if f is not None:
421+
return f(*args, **kwargs)
422+
423+
f = getattr(np, arg, None)
424+
if f is not None:
425+
return f(self, *args, **kwargs)
426+
427+
raise ValueError("{} is an unknown string function".format(arg))
428+
407429
def _aggregate(self, arg, *args, **kwargs):
408430
"""
409431
provide an implementation for the aggregators
@@ -427,14 +449,19 @@ def _aggregate(self, arg, *args, **kwargs):
427449
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
428450
is_nested_renamer = False
429451

452+
_axis = kwargs.pop('_axis', None)
453+
if _axis is None:
454+
_axis = getattr(self, 'axis', 0)
430455
_level = kwargs.pop('_level', None)
456+
431457
if isinstance(arg, compat.string_types):
432-
return getattr(self, arg)(*args, **kwargs), None
458+
return self._try_aggregate_string_function(arg, *args,
459+
**kwargs), None
433460

434461
if isinstance(arg, dict):
435462

436463
# aggregate based on the passed dict
437-
if self.axis != 0: # pragma: no cover
464+
if _axis != 0: # pragma: no cover
438465
raise ValueError('Can only pass dict with axis=0')
439466

440467
obj = self._selected_obj
@@ -560,26 +587,33 @@ def _agg(arg, func):
560587
ABCDataFrame):
561588
result = concat([result[k] for k in keys], keys=keys, axis=1)
562589
else:
563-
from pandas import DataFrame
564-
result = DataFrame(result)
590+
from pandas import DataFrame, Series
591+
try:
592+
result = DataFrame(result)
593+
except ValueError:
594+
# we have a dict of scalars
595+
result = Series(result, name=self.name)
565596

566597
return result, True
567-
elif hasattr(arg, '__iter__'):
568-
return self._aggregate_multiple_funcs(arg, _level=_level), None
598+
elif is_list_like(arg) and arg not in compat.string_types:
599+
# we require a list, but not a 'str'
600+
return self._aggregate_multiple_funcs(arg,
601+
_level=_level,
602+
_axis=_axis), None
569603
else:
570604
result = None
571605

572-
cy_func = self._is_cython_func(arg)
573-
if cy_func and not args and not kwargs:
574-
return getattr(self, cy_func)(), None
606+
f = self._is_cython_func(arg)
607+
if f and not args and not kwargs:
608+
return getattr(self, f)(), None
575609

576610
# caller can react
577611
return result, True
578612

579-
def _aggregate_multiple_funcs(self, arg, _level):
613+
def _aggregate_multiple_funcs(self, arg, _level, _axis):
580614
from pandas.tools.merge import concat
581615

582-
if self.axis != 0:
616+
if _axis != 0:
583617
raise NotImplementedError("axis other than 0 is not supported")
584618

585619
if self._selected_obj.ndim == 1:
@@ -617,7 +651,15 @@ def _aggregate_multiple_funcs(self, arg, _level):
617651
except SpecificationError:
618652
raise
619653

620-
return concat(results, keys=keys, axis=1)
654+
try:
655+
return concat(results, keys=keys, axis=1)
656+
except TypeError:
657+
# shape change
658+
from pandas.types.cast import _maybe_convert_nested_object
659+
from pandas import Series
660+
result = Series(results, index=keys, name=self.name)
661+
result = _maybe_convert_nested_object(result)
662+
return result
621663

622664
def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
623665
""" return a new object with the replacement attributes """

pandas/core/frame.py

+40-4
Original file line numberDiff line numberDiff line change
@@ -4073,6 +4073,38 @@ def diff(self, periods=1, axis=0):
40734073
# ----------------------------------------------------------------------
40744074
# Function application
40754075

4076+
def _gotitem(self, key, ndim, subset=None):
4077+
"""
4078+
sub-classes to define
4079+
return a sliced object
4080+
4081+
Parameters
4082+
----------
4083+
key : string / list of selections
4084+
ndim : 1,2
4085+
requested ndim of result
4086+
subset : object, default None
4087+
subset to act on
4088+
"""
4089+
if subset is None:
4090+
subset = self
4091+
4092+
# TODO: _shallow_copy(subset)?
4093+
return self[key]
4094+
4095+
def aggregate(self, func, axis=0, *args, **kwargs):
4096+
axis = self._get_axis_number(axis)
4097+
4098+
# TODO: flipped axis
4099+
result = None
4100+
if axis == 0:
4101+
result, how = self._aggregate(func, axis=0, *args, **kwargs)
4102+
if result is None:
4103+
return self.apply(func, axis=axis, args=args, **kwargs)
4104+
return result
4105+
4106+
agg = aggregate
4107+
40764108
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
40774109
args=(), **kwds):
40784110
"""
@@ -4134,16 +4166,20 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
41344166
applied : Series or DataFrame
41354167
"""
41364168
axis = self._get_axis_number(axis)
4137-
if kwds or args and not isinstance(func, np.ufunc):
41384169

4170+
# dispatch to agg
4171+
if axis == 0 and isinstance(func, (list, dict)):
4172+
return self.aggregate(func, axis=axis, *args, **kwds)
4173+
4174+
if len(self.columns) == 0 and len(self.index) == 0:
4175+
return self._apply_empty_result(func, axis, reduce, *args, **kwds)
4176+
4177+
if kwds or args and not isinstance(func, np.ufunc):
41394178
def f(x):
41404179
return func(x, *args, **kwds)
41414180
else:
41424181
f = func
41434182

4144-
if len(self.columns) == 0 and len(self.index) == 0:
4145-
return self._apply_empty_result(func, axis, reduce, *args, **kwds)
4146-
41474183
if isinstance(f, np.ufunc):
41484184
with np.errstate(all='ignore'):
41494185
results = f(self.values)

pandas/core/generic.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
SettingWithCopyError, SettingWithCopyWarning,
3434
AbstractMethodError)
3535

36-
from pandas.core.base import PandasObject
36+
from pandas.core.base import PandasObject, SelectionMixin
3737
from pandas.core.index import (Index, MultiIndex, _ensure_index,
3838
InvalidIndexError)
3939
import pandas.core.indexing as indexing
@@ -91,7 +91,7 @@ def _single_replace(self, to_replace, method, inplace, limit):
9191
return result
9292

9393

94-
class NDFrame(PandasObject):
94+
class NDFrame(PandasObject, SelectionMixin):
9595
"""
9696
N-dimensional analogue of DataFrame. Store multi-dimensional in a
9797
size-mutable, labeled data structure
@@ -428,6 +428,16 @@ def size(self):
428428
"""number of elements in the NDFrame"""
429429
return np.prod(self.shape)
430430

431+
@property
432+
def _selected_obj(self):
433+
""" internal compat with SelectionMixin """
434+
return self
435+
436+
@property
437+
def _obj_with_exclusions(self):
438+
""" internal compat with SelectionMixin """
439+
return self
440+
431441
def _expand_axes(self, key):
432442
new_axes = []
433443
for k, ax in zip(key, self.axes):

pandas/core/groupby.py

+21-18
Original file line numberDiff line numberDiff line change
@@ -703,7 +703,7 @@ def _python_apply_general(self, f):
703703
not_indexed_same=mutated or self.mutated)
704704

705705
def _iterate_slices(self):
706-
yield self.name, self._selected_obj
706+
yield self._selection_name, self._selected_obj
707707

708708
def transform(self, func, *args, **kwargs):
709709
raise AbstractMethodError(self)
@@ -886,9 +886,9 @@ def reset_identity(values):
886886
result = concat(values, axis=self.axis)
887887

888888
if (isinstance(result, Series) and
889-
getattr(self, 'name', None) is not None):
889+
getattr(self, '_selection_name', None) is not None):
890890

891-
result.name = self.name
891+
result.name = self._selection_name
892892

893893
return result
894894

@@ -2575,7 +2575,7 @@ class SeriesGroupBy(GroupBy):
25752575
exec(_def_str)
25762576

25772577
@property
2578-
def name(self):
2578+
def _selection_name(self):
25792579
"""
25802580
since we are a series, we by definition only have
25812581
a single name, but may be the result of a selection or
@@ -2718,12 +2718,12 @@ def _aggregate_multiple_funcs(self, arg, _level):
27182718

27192719
def _wrap_output(self, output, index, names=None):
27202720
""" common agg/transform wrapping logic """
2721-
output = output[self.name]
2721+
output = output[self._selection_name]
27222722

27232723
if names is not None:
27242724
return DataFrame(output, index=index, columns=names)
27252725
else:
2726-
name = self.name
2726+
name = self._selection_name
27272727
if name is None:
27282728
name = self._selected_obj.name
27292729
return Series(output, index=index, name=name)
@@ -2741,7 +2741,7 @@ def _wrap_transformed_output(self, output, names=None):
27412741
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
27422742
if len(keys) == 0:
27432743
# GH #6265
2744-
return Series([], name=self.name, index=keys)
2744+
return Series([], name=self._selection_name, index=keys)
27452745

27462746
def _get_index():
27472747
if self.grouper.nkeys > 1:
@@ -2754,7 +2754,7 @@ def _get_index():
27542754
# GH #823
27552755
index = _get_index()
27562756
result = DataFrame(values, index=index).stack()
2757-
result.name = self.name
2757+
result.name = self._selection_name
27582758
return result
27592759

27602760
if isinstance(values[0], (Series, dict)):
@@ -2766,7 +2766,8 @@ def _get_index():
27662766
not_indexed_same=not_indexed_same)
27672767
else:
27682768
# GH #6265
2769-
return Series(values, index=_get_index(), name=self.name)
2769+
return Series(values, index=_get_index(),
2770+
name=self._selection_name)
27702771

27712772
def _aggregate_named(self, func, *args, **kwargs):
27722773
result = {}
@@ -2938,7 +2939,7 @@ def nunique(self, dropna=True):
29382939

29392940
return Series(res,
29402941
index=ri,
2941-
name=self.name)
2942+
name=self._selection_name)
29422943

29432944
@deprecate_kwarg('take_last', 'keep',
29442945
mapping={True: 'last', False: 'first'})
@@ -3002,7 +3003,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30023003
# multi-index components
30033004
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
30043005
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
3005-
names = self.grouper.names + [self.name]
3006+
names = self.grouper.names + [self._selection_name]
30063007

30073008
if dropna:
30083009
mask = labels[-1] != -1
@@ -3037,7 +3038,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30373038

30383039
if is_integer_dtype(out):
30393040
out = _ensure_int64(out)
3040-
return Series(out, index=mi, name=self.name)
3041+
return Series(out, index=mi, name=self._selection_name)
30413042

30423043
# for compat. with algos.value_counts need to ensure every
30433044
# bin is present at every index level, null filled with zeros
@@ -3068,7 +3069,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30683069

30693070
if is_integer_dtype(out):
30703071
out = _ensure_int64(out)
3071-
return Series(out, index=mi, name=self.name)
3072+
return Series(out, index=mi, name=self._selection_name)
30723073

30733074
def count(self):
30743075
""" Compute count of group, excluding missing values """
@@ -3081,7 +3082,7 @@ def count(self):
30813082

30823083
return Series(out,
30833084
index=self.grouper.result_index,
3084-
name=self.name,
3085+
name=self._selection_name,
30853086
dtype='int64')
30863087

30873088
def _apply_to_column_groupbys(self, func):
@@ -3191,7 +3192,7 @@ def aggregate(self, arg, *args, **kwargs):
31913192
try:
31923193
assert not args and not kwargs
31933194
result = self._aggregate_multiple_funcs(
3194-
[arg], _level=_level)
3195+
[arg], _level=_level, _axis=self.axis)
31953196
result.columns = Index(
31963197
result.columns.levels[0],
31973198
name=self._selected_obj.columns.name)
@@ -3422,7 +3423,8 @@ def first_non_None_value(values):
34223423
except (ValueError, AttributeError):
34233424
# GH1738: values is list of arrays of unequal lengths fall
34243425
# through to the outer else clause
3425-
return Series(values, index=key_index, name=self.name)
3426+
return Series(values, index=key_index,
3427+
name=self._selection_name)
34263428

34273429
# if we have date/time like in the original, then coerce dates
34283430
# as we are stacking can easily have object dtypes here
@@ -3445,8 +3447,9 @@ def first_non_None_value(values):
34453447
# only coerce dates if we find at least 1 datetime
34463448
coerce = True if any([isinstance(x, Timestamp)
34473449
for x in values]) else False
3448-
# self.name not passed through to Series as the result
3449-
# should not take the name of original selection of columns
3450+
# self._selection_name not passed through to Series as the
3451+
# result should not take the name of original selection
3452+
# of columns
34503453
return (Series(values, index=key_index)
34513454
._convert(datetime=True,
34523455
coerce=coerce))

0 commit comments

Comments
 (0)